TensorConvolution.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12
13namespace Eigen {
14
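// Illustrative usage (a sketch, not part of the original header): a
// convolution expression is normally created through
// TensorBase::convolve(kernel, dims), e.g. convolving a 2-d tensor with a
// 3x3 kernel along both dimensions:
//
//   Eigen::Tensor<float, 2> input(20, 30);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<ptrdiff_t, 2> dims({0, 1});
//   Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
//
// Each convolved dimension of the result has size input_dim - kernel_dim + 1
// (here 18 x 28), i.e. the "valid" part of the convolution, matching the
// dimension computation in the evaluators below.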
22namespace internal {
23
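// IndexMapper translates between the flattened indexing used by the GPU
// convolution kernels defined below and linear offsets into the actual input
// and output tensors. The convolved ("kernel") dimensions are placed at the
// front of the ordering for ColMajor layouts and at the back for RowMajor
// layouts; the remaining dimensions are collapsed into "planes". The
// mapCuda*PlaneToTensor*Offset functions recover the tensor offset of a
// plane, and the mapCuda*KernelToTensor*Offset overloads add the offset of a
// point inside the convolved dimensions.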
24template <typename Index, typename InputDims, int NumKernelDims, int Layout>
25class IndexMapper {
26 public:
27 IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
28 const array<Index, NumKernelDims>& indices) {
29
30 array<Index, NumDims> dimensions = input_dims;
31 for (int i = 0; i < NumKernelDims; ++i) {
32 const Index index = indices[i];
33 const Index input_dim = input_dims[index];
34 const Index kernel_dim = kernel_dims[i];
35 const Index result_dim = input_dim - kernel_dim + 1;
36 dimensions[index] = result_dim;
37 }
38
39 array<Index, NumDims> inputStrides;
40 array<Index, NumDims> outputStrides;
41 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
42 inputStrides[0] = 1;
43 outputStrides[0] = 1;
44 for (int i = 1; i < NumDims; ++i) {
45 inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
46 outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
47 }
48 } else {
49 inputStrides[NumDims - 1] = 1;
50 outputStrides[NumDims - 1] = 1;
51 for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
52 inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
53 outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
54 }
55 }
56
57 array<Index, NumDims> cudaInputDimensions;
58 array<Index, NumDims> cudaOutputDimensions;
59 array<Index, NumDims> tmp = dimensions;
60 array<Index, NumDims> ordering;
61 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
62 ? 0
63 : NumDims - NumKernelDims;
64 for (int i = 0; i < NumKernelDims; ++i) {
65 const Index index = i + offset;
66 ordering[index] = indices[i];
67 tmp[indices[i]] = -1;
68 cudaInputDimensions[index] = input_dims[indices[i]];
69 cudaOutputDimensions[index] = dimensions[indices[i]];
70 }
71
72 int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
73 ? NumKernelDims
74 : 0;
75 for (int i = 0; i < NumDims; ++i) {
76 if (tmp[i] >= 0) {
77 ordering[written] = i;
78 cudaInputDimensions[written] = input_dims[i];
79 cudaOutputDimensions[written] = dimensions[i];
80 ++written;
81 }
82 }
83
84 for (int i = 0; i < NumDims; ++i) {
85 m_inputStrides[i] = inputStrides[ordering[i]];
86 m_outputStrides[i] = outputStrides[ordering[i]];
87 }
88
89 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
90 for (int i = 0; i < NumDims; ++i) {
91 if (i > NumKernelDims) {
92 m_cudaInputStrides[i] =
93 m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
94 m_cudaOutputStrides[i] =
95 m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
96 } else {
97 m_cudaInputStrides[i] = 1;
98 m_cudaOutputStrides[i] = 1;
99 }
100 }
101 } else {
102 for (int i = NumDims - 1; i >= 0; --i) {
103 if (i + 1 < offset) {
104 m_cudaInputStrides[i] =
105 m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
106 m_cudaOutputStrides[i] =
107 m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
108 } else {
109 m_cudaInputStrides[i] = 1;
110 m_cudaOutputStrides[i] = 1;
111 }
112 }
113 }
114 }
115
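  // Map the linear index p of a "plane" (a point in the non-convolved
  // dimensions) to the corresponding linear offset in the input tensor; the
  // output variant below does the same for the result tensor.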
116 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
117 Index inputIndex = 0;
118 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
119 for (int d = NumDims - 1; d > NumKernelDims; --d) {
120 const Index idx = p / m_cudaInputStrides[d];
121 inputIndex += idx * m_inputStrides[d];
122 p -= idx * m_cudaInputStrides[d];
123 }
124 inputIndex += p * m_inputStrides[NumKernelDims];
125 } else {
126 std::ptrdiff_t limit = 0;
127 if (NumKernelDims < NumDims) {
128 limit = NumDims - NumKernelDims - 1;
129 }
130 for (int d = 0; d < limit; ++d) {
131 const Index idx = p / m_cudaInputStrides[d];
132 inputIndex += idx * m_inputStrides[d];
133 p -= idx * m_cudaInputStrides[d];
134 }
135 inputIndex += p * m_inputStrides[limit];
136 }
137 return inputIndex;
138 }
139
140 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
141 Index outputIndex = 0;
142 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
143 for (int d = NumDims - 1; d > NumKernelDims; --d) {
144 const Index idx = p / m_cudaOutputStrides[d];
145 outputIndex += idx * m_outputStrides[d];
146 p -= idx * m_cudaOutputStrides[d];
147 }
148 outputIndex += p * m_outputStrides[NumKernelDims];
149 } else {
150 std::ptrdiff_t limit = 0;
151 if (NumKernelDims < NumDims) {
152 limit = NumDims - NumKernelDims - 1;
153 }
154 for (int d = 0; d < limit; ++d) {
155 const Index idx = p / m_cudaOutputStrides[d];
156 outputIndex += idx * m_outputStrides[d];
157 p -= idx * m_cudaOutputStrides[d];
158 }
159 outputIndex += p * m_outputStrides[limit];
160 }
161 return outputIndex;
162 }
163
164 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
165 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166 ? 0
167 : NumDims - NumKernelDims;
168 return i * m_inputStrides[offset];
169 }
170
171 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
172 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173 ? 0
174 : NumDims - NumKernelDims;
175 return i * m_outputStrides[offset];
176 }
177
178 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
179 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180 ? 0
181 : NumDims - NumKernelDims;
182 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
183 }
184
185 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
186 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187 ? 0
188 : NumDims - NumKernelDims;
189 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
190 }
191
192 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
193 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
194 ? 0
195 : NumDims - NumKernelDims;
196 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
197 k * m_inputStrides[offset + 2];
198 }
199
200 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
201 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
202 ? 0
203 : NumDims - NumKernelDims;
204 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
205 k * m_outputStrides[offset + 2];
206 }
207
208 private:
209 static const int NumDims = internal::array_size<InputDims>::value;
210 array<Index, NumDims> m_inputStrides;
211 array<Index, NumDims> m_outputStrides;
212 array<Index, NumDims> m_cudaInputStrides;
213 array<Index, NumDims> m_cudaOutputStrides;
214};
215
216
217
218template<typename Dimensions, typename InputXprType, typename KernelXprType>
219struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220{
221 // Type promotion to handle the case where the types of the lhs and the rhs are different.
222 typedef typename promote_storage_type<typename InputXprType::Scalar,
223 typename KernelXprType::Scalar>::ret Scalar;
224 typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
225 typename traits<KernelXprType>::StorageKind>::ret StorageKind;
226 typedef typename promote_index_type<typename traits<InputXprType>::Index,
227 typename traits<KernelXprType>::Index>::type Index;
228 typedef typename InputXprType::Nested LhsNested;
229 typedef typename KernelXprType::Nested RhsNested;
230 typedef typename remove_reference<LhsNested>::type _LhsNested;
231 typedef typename remove_reference<RhsNested>::type _RhsNested;
232 static const int NumDimensions = traits<InputXprType>::NumDimensions;
233 static const int Layout = traits<InputXprType>::Layout;
234
235 enum {
236 Flags = 0
237 };
238};
239
240template<typename Dimensions, typename InputXprType, typename KernelXprType>
241struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
242{
243 typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
244};
245
246template<typename Dimensions, typename InputXprType, typename KernelXprType>
247struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
248{
249 typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
250};
251
252} // end namespace internal
253
254
255
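// Expression node representing the convolution of an input expression with a
// kernel expression along the dimensions listed in Indices. It only stores
// the two nested expressions and the convolution indices; the actual
// computation is performed by the TensorEvaluator specializations below.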
256template<typename Indices, typename InputXprType, typename KernelXprType>
257class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
258{
259 public:
260 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
261 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
262 typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
263 typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
264 typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
265 typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
266 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
267
268 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
269 : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
270
271 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
272 const Indices& indices() const { return m_indices; }
273
275 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
276 const typename internal::remove_all<typename InputXprType::Nested>::type&
277 inputExpression() const { return m_input_xpr; }
278
279 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
280 const typename internal::remove_all<typename KernelXprType::Nested>::type&
281 kernelExpression() const { return m_kernel_xpr; }
282
283 protected:
284 typename InputXprType::Nested m_input_xpr;
285 typename KernelXprType::Nested m_kernel_xpr;
286 const Indices m_indices;
287};
288
289
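// Generic evaluator (used when no device-specific specialization applies,
// e.g. on the CPU). Output coefficients are computed on demand in
// coeff()/packet() by recursing over the convolved dimensions (see convolve()
// and convolvePacket() below) and accumulating input * kernel products.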
290template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
291struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
292{
293 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
294
295 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
296 static const int NumKernelDims = internal::array_size<Indices>::value;
297 typedef typename XprType::Index Index;
298 typedef DSizes<Index, NumDims> Dimensions;
299
300 typedef typename XprType::Scalar Scalar;
301 typedef typename XprType::CoeffReturnType CoeffReturnType;
302 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
303 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
304
305 enum {
306 IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
307 PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
308 Layout = TensorEvaluator<InputArgType, Device>::Layout,
309 CoordAccess = false, // to be implemented
310 RawAccess = false
311 };
312
313 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
314 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
315 {
316 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
317
318 const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
319 const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
320
321 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
322 m_inputStride[0] = 1;
323 for (int i = 1; i < NumDims; ++i) {
324 m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
325 }
326 } else {
327 m_inputStride[NumDims - 1] = 1;
328 for (int i = NumDims - 2; i >= 0; --i) {
329 m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
330 }
331 }
332
333 m_dimensions = m_inputImpl.dimensions();
334 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
335 for (int i = 0; i < NumKernelDims; ++i) {
336 const Index index = op.indices()[i];
337 const Index input_dim = input_dims[index];
338 const Index kernel_dim = kernel_dims[i];
339 const Index result_dim = input_dim - kernel_dim + 1;
340 m_dimensions[index] = result_dim;
341 if (i > 0) {
342 m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
343 } else {
344 m_kernelStride[0] = 1;
345 }
346 m_indexStride[i] = m_inputStride[index];
347 }
348
349 m_outputStride[0] = 1;
350 for (int i = 1; i < NumDims; ++i) {
351 m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
352 }
353 } else {
354 for (int i = NumKernelDims - 1; i >= 0; --i) {
355 const Index index = op.indices()[i];
356 const Index input_dim = input_dims[index];
357 const Index kernel_dim = kernel_dims[i];
358 const Index result_dim = input_dim - kernel_dim + 1;
359 m_dimensions[index] = result_dim;
360 if (i < NumKernelDims - 1) {
361 m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
362 } else {
363 m_kernelStride[NumKernelDims - 1] = 1;
364 }
365 m_indexStride[i] = m_inputStride[index];
366 }
367
368 m_outputStride[NumDims - 1] = 1;
369 for (int i = NumDims - 2; i >= 0; --i) {
370 m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
371 }
372 }
373 }
374
375 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
376
377 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
378 m_inputImpl.evalSubExprsIfNeeded(NULL);
379 preloadKernel();
380 return true;
381 }
382 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
383 m_inputImpl.cleanup();
384 if (m_local_kernel) {
385 m_device.deallocate((void*)m_kernel);
386 m_local_kernel = false;
387 }
388 m_kernel = NULL;
389 }
390
391 void evalTo(typename XprType::Scalar* buffer) {
392 evalSubExprsIfNeeded(NULL);
393 for (int i = 0; i < dimensions().TotalSize(); ++i) {
394 buffer[i] += coeff(i);
395 }
396 cleanup();
397 }
398
399 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
400 {
401 CoeffReturnType result = CoeffReturnType(0);
402 convolve(firstInput(index), 0, NumKernelDims-1, result);
403 return result;
404 }
405
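  // Vectorized path: when the PacketSize consecutive outputs map to
  // contiguous inputs, the whole packet is convolved at once; otherwise each
  // lane falls back to the scalar convolve().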
406 template<int LoadMode>
407 EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
408 {
409 Index indices[2] = {index, index+PacketSize-1};
410 Index startInputs[2] = {0, 0};
411 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
412 for (int i = NumDims - 1; i > 0; --i) {
413 const Index idx0 = indices[0] / m_outputStride[i];
414 const Index idx1 = indices[1] / m_outputStride[i];
415 startInputs[0] += idx0 * m_inputStride[i];
416 startInputs[1] += idx1 * m_inputStride[i];
417 indices[0] -= idx0 * m_outputStride[i];
418 indices[1] -= idx1 * m_outputStride[i];
419 }
420 } else {
421 for (int i = 0; i < NumDims - 1; ++i) {
422 const Index idx0 = indices[0] / m_outputStride[i];
423 const Index idx1 = indices[1] / m_outputStride[i];
424 startInputs[0] += idx0 * m_inputStride[i];
425 startInputs[1] += idx1 * m_inputStride[i];
426 indices[0] -= idx0 * m_outputStride[i];
427 indices[1] -= idx1 * m_outputStride[i];
428 }
429 }
430 startInputs[0] += indices[0];
431 startInputs[1] += indices[1];
432
433 if (startInputs[1]-startInputs[0] == PacketSize-1) {
434 PacketReturnType result = internal::pset1<PacketReturnType>(0);
435 convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
436 return result;
437 } else {
438 EIGEN_ALIGN_MAX Scalar data[PacketSize];
439 data[0] = Scalar(0);
440 convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
441 for (int i = 1; i < PacketSize-1; ++i) {
442 data[i] = Scalar(0);
443 convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
444 }
445 data[PacketSize-1] = Scalar(0);
446 convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
447 return internal::pload<PacketReturnType>(data);
448 }
449 }
450
451 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
452 costPerCoeff(bool vectorized) const {
453 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
454 // We ignore the use of fused multiply-add.
455 const double convolve_compute_cost =
456 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
457 const double firstIndex_compute_cost =
458 NumDims *
459 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
460 TensorOpCost::DivCost<Index>());
461 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
462 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
463 m_kernelImpl.costPerCoeff(vectorized) +
464 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
465 PacketSize));
466 }
467
468 EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
469
470 private:
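  // Map an output linear index to the linear index of the first input
  // coefficient of the corresponding convolution window.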
471 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
472 Index startInput = 0;
473 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
474 for (int i = NumDims - 1; i > 0; --i) {
475 const Index idx = index / m_outputStride[i];
476 startInput += idx * m_inputStride[i];
477 index -= idx * m_outputStride[i];
478 }
479 } else {
480 for (int i = 0; i < NumDims - 1; ++i) {
481 const Index idx = index / m_outputStride[i];
482 startInput += idx * m_inputStride[i];
483 index -= idx * m_outputStride[i];
484 }
485 }
486 startInput += index;
487 return startInput;
488 }
489
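  // Recursively accumulate input * kernel over the convolved dimensions;
  // DimIndex counts down and the multiply-accumulate happens at DimIndex == 0.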
490 EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
491 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
492 const Index input = firstIndex + j * m_indexStride[DimIndex];
493 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
494 if (DimIndex > 0) {
495 convolve(input, kernel, DimIndex-1, accum);
496 } else {
497 accum += m_inputImpl.coeff(input) * m_kernel[kernel];
498 }
499 }
500 }
501
502 template <typename Packet>
503 EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
504 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
505 const Index input = firstIndex + j * m_indexStride[DimIndex];
506 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
507 if (DimIndex > 0) {
508 convolvePacket(input, kernel, DimIndex-1, accum);
509 } else {
510 accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
511 }
512 }
513 }
514
515 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
516 // Don't make a local copy of the kernel unless we have to (i.e. it's an
517 // expression that needs to be evaluated)
518 const Scalar* in_place = m_kernelImpl.data();
519 if (in_place) {
520 m_kernel = in_place;
521 m_local_kernel = false;
522 } else {
523 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
524 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
525 typedef TensorEvalToOp<const KernelArgType> EvalTo;
526 EvalTo evalToTmp(local, m_kernelArg);
527 const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
528 internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
529
530 m_kernel = local;
531 m_local_kernel = true;
532 }
533 }
534
535 array<Index, NumDims> m_inputStride;
536 array<Index, NumDims> m_outputStride;
537
538 array<Index, NumKernelDims> m_indexStride;
539 array<Index, NumKernelDims> m_kernelStride;
540 TensorEvaluator<InputArgType, Device> m_inputImpl;
541 TensorEvaluator<KernelArgType, Device> m_kernelImpl;
542 Dimensions m_dimensions;
543
544 KernelArgType m_kernelArg;
545 const Scalar* m_kernel;
546 bool m_local_kernel;
547 const Device& m_device;
548};
549
550
551
552
553// Use an optimized implementation of the evaluation code for GPUs whenever possible.
554#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
555
556template <int StaticKernelSize>
557struct GetKernelSize {
558 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
559 return StaticKernelSize;
560 }
561};
562template <>
563struct GetKernelSize<Dynamic> {
564 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
565 return kernelSize;
566 }
567};
568
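// 1D GPU convolution kernel: each block stages a strip of an input plane
// (with kernelSize - 1 extra values) in shared memory, then every thread
// computes output points by taking the dot product of its window with the
// kernel, which is read from global memory.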
569template <typename InputEvaluator, typename Index, typename InputDims,
570 int StaticKernelSize>
571__global__ void EigenConvolutionKernel1D(
572 InputEvaluator eval,
573 const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
574 indexMapper,
575 const float* __restrict kernel, const int numPlanes, const int numX,
576 const int maxX, const int kernelSize, float* buffer) {
577 extern __shared__ float s[];
578
579 const int first_x = blockIdx.x * maxX;
580 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
581 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
582 const int num_x_output = last_x - first_x + 1;
583
584 const int first_plane = blockIdx.y * blockDim.y;
585 const int plane_stride = blockDim.y * gridDim.y;
586
587 for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
588 // Load inputs to shared memory
589 const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
590 const int plane_kernel_offset = threadIdx.y * num_x_input;
591 #pragma unroll
592 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
593 const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
594 s[i + plane_kernel_offset] = eval.coeff(tensor_index);
595 }
596
597 __syncthreads();
598
599 // Compute the convolution
600 const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
601
602 #pragma unroll
603 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
604 const int kernel_offset = plane_kernel_offset + i;
605 float result = 0.0f;
606 #pragma unroll
607 for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
608 result += s[k + kernel_offset] * kernel[k];
609 }
610 const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
611 buffer[tensor_index] = result;
612 }
613 __syncthreads();
614 }
615};
616
617template <typename InputEvaluator, typename Index, typename InputDims,
618 int StaticKernelSizeX, int StaticKernelSizeY>
619__global__ void EigenConvolutionKernel2D(
620 InputEvaluator eval,
621 const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
622 indexMapper,
623 const float* __restrict kernel, const int numPlanes, const int numX,
624 const int maxX, const int numY, const int maxY, const int kernelSizeX,
625 const int kernelSizeY, float* buffer) {
626 extern __shared__ float s[];
627
628 const int first_x = blockIdx.x * maxX;
629 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
630 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
631 const int num_x_output = last_x - first_x + 1;
632
633 const int first_y = blockIdx.y * maxY;
634 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
635 const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
636 const int num_y_output = last_y - first_y + 1;
637
638 const int first_plane = blockIdx.z * blockDim.z;
639 const int plane_stride = blockDim.z * gridDim.z;
640
641 for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
642
643 const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
644 const int plane_kernel_offset = threadIdx.z * num_y_input;
645
646 // Load inputs to shared memory
647 #pragma unroll
648 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
649 const int input_offset = num_x_input * (j + plane_kernel_offset);
650 #pragma unroll
651 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
652 const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
653 s[i + input_offset] = eval.coeff(tensor_index);
654 }
655 }
656
657 __syncthreads();
658
659 // Convolution
660 const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
661
662 #pragma unroll
663 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
664 #pragma unroll
665 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
666 float result = 0.0f;
667 #pragma unroll
668 for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
669 const int kernel_offset = kernelSizeX * l;
670 const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
671 #pragma unroll
672 for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
673 result += s[k + input_offset] * kernel[k + kernel_offset];
674 }
675 }
676 const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
677 buffer[tensor_index] = result;
678 }
679 }
680
681 __syncthreads();
682 }
683};
684
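// 3D GPU convolution kernel: planes are processed one at a time; a 3D tile
// padded by the kernel size is staged in shared memory and each thread
// accumulates over the full x/y/z kernel window.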
685template <typename InputEvaluator, typename Index, typename InputDims>
686__global__ void EigenConvolutionKernel3D(
687 InputEvaluator eval,
688 const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
689 indexMapper,
690 const float* __restrict kernel, const size_t numPlanes, const size_t numX,
691 const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
692 const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
693 const size_t kernelSizeZ, float* buffer) {
694 extern __shared__ float s[];
695
696 // Load inputs to shared memory
697 const int first_x = blockIdx.x * maxX;
698 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
699 const int num_x_input = last_x - first_x + kernelSizeX;
700
701 const int first_y = blockIdx.y * maxY;
702 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
703 const int num_y_input = last_y - first_y + kernelSizeY;
704
705 const int first_z = blockIdx.z * maxZ;
706 const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
707 const int num_z_input = last_z - first_z + kernelSizeZ;
708
709 for (int p = 0; p < numPlanes; ++p) {
710
711 const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
712 const int plane_kernel_offset = 0;
713
714 for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
715 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
716 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
717 const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
718 s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
719 }
720 }
721 }
722
723 __syncthreads();
724
725 // Convolution
726 const int num_z_output = last_z - first_z + 1;
727 const int num_y_output = last_y - first_y + 1;
728 const int num_x_output = last_x - first_x + 1;
729 const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
730
731 for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
732 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
733 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
734 float result = 0.0f;
735 for (int n = 0; n < kernelSizeZ; ++n) {
736 for (int m = 0; m < kernelSizeY; ++m) {
737 for (int l = 0; l < kernelSizeX; ++l) {
738 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
739 }
740 }
741 }
742 const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
743 buffer[tensor_index] = result;
744 }
745 }
746 }
747 __syncthreads();
748 }
749};
750
751
752
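// GPU evaluator: instead of computing coefficients on demand, the whole
// result is materialized into a device buffer by executeEval() when the
// expression is evaluated; coeff() and packet() then simply read from that
// buffer.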
753template<typename Indices, typename InputArgType, typename KernelArgType>
754struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
755{
756 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
757
758 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
759 static const int NumKernelDims = internal::array_size<Indices>::value;
760 typedef typename XprType::Index Index;
761 typedef DSizes<Index, NumDims> Dimensions;
762 typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
763
764 enum {
765 IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
766 PacketAccess = false,
767 Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
768 CoordAccess = false, // to be implemented
769 RawAccess = false
770 };
771
772 EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
773 : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
774 {
775 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
776
777 const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
778 const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
779
780 m_dimensions = m_inputImpl.dimensions();
781 for (int i = 0; i < NumKernelDims; ++i) {
782 const Index index = op.indices()[i];
783 const Index input_dim = input_dims[index];
784 const Index kernel_dim = kernel_dims[i];
785 const Index result_dim = input_dim - kernel_dim + 1;
786 m_dimensions[index] = result_dim;
787 }
788 }
789
790 typedef typename XprType::CoeffReturnType CoeffReturnType;
791 typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
792 typedef typename InputArgType::Scalar Scalar;
793 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
794
795 EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
796
797 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
798 preloadKernel();
799 m_inputImpl.evalSubExprsIfNeeded(NULL);
800 if (data) {
801 executeEval(data);
802 return false;
803 } else {
804 m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
805 executeEval(m_buf);
806 return true;
807 }
808 }
809
810 EIGEN_STRONG_INLINE void cleanup() {
811 m_inputImpl.cleanup();
812 if (m_buf) {
813 m_device.deallocate(m_buf);
814 m_buf = NULL;
815 }
816 if (m_local_kernel) {
817 m_device.deallocate((void*)m_kernel);
818 m_local_kernel = false;
819 }
820 m_kernel = NULL;
821 }
822
823 EIGEN_STRONG_INLINE void preloadKernel() {
824 // Don't make a local copy of the kernel unless we have to (i.e. it's an
825 // expression that needs to be evaluated)
826 const Scalar* in_place = m_kernelImpl.data();
827 if (in_place) {
828 m_kernel = in_place;
829 m_local_kernel = false;
830 } else {
831 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
832 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
833 typedef TensorEvalToOp<const KernelArgType> EvalTo;
834 EvalTo evalToTmp(local, m_kernelArg);
835 const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
836 internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
837
838 m_kernel = local;
839 m_local_kernel = true;
840 }
841 }
842
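  // Integer division, rounding up.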
843 static unsigned int ceil(unsigned int num, unsigned int denom) {
844 const unsigned int rounded_toward_zero = num / denom;
845 if (num > rounded_toward_zero * denom) {
846 return rounded_toward_zero + 1;
847 }
848 return rounded_toward_zero;
849 }
850
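  // Choose block/grid dimensions and the shared memory size from the number
  // of convolved dimensions (1, 2 or 3) and the device limits, then launch
  // the matching EigenConvolutionKernel*D. Kernel sizes 4 and 7 get
  // statically-sized instantiations so the inner loops can be fully unrolled.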
851 void executeEval(Scalar* data) const {
852 typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
853
854 const int maxSharedMem = m_device.sharedMemPerBlock();
855 const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
856 const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
857 const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
858 const int warpSize = 32;
859
860 switch (NumKernelDims) {
861 case 1: {
862 const int kernel_size = m_kernelImpl.dimensions().TotalSize();
863
864 const int numX = dimensions()[m_indices[0]];
865 const int numP = dimensions().TotalSize() / numX;
866 int maxX;
867 dim3 block_size;
868
869 const int single_stride_dim =
870 static_cast<int>(Layout) == static_cast<int>(ColMajor)
871 ? 0
872 : m_inputImpl.dimensions().rank() - 1;
873 if (m_indices[0] == single_stride_dim) {
874 // Maximize reuse of the data loaded into shared memory
875 const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
876 maxX = numext::mini<int>(inner_dim, numX);
877 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
878 block_size.x = numext::mini(maxThreadsPerBlock, maxX);
879 block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
880 }
881 else {
882 // Read as much as possible along the innermost dimension, that is the plane
883 const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
884 const int maxP = numext::mini<int>(inner_dim, numP);
885 maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
886
887 block_size.x = numext::mini(warpSize, maxX);
888 block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
889 }
890
891 const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
892 assert(shared_mem <= maxSharedMem);
893
894 const int num_x_blocks = ceil(numX, maxX);
895 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
896 const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
897
898 dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
899
900
901 //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
902
903 const array<Index, 1> indices(m_indices[0]);
904 const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
905 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
906 m_inputImpl.dimensions(), kernel_dims, indices);
907 switch(kernel_size) {
908 case 4: {
909 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
910 break;
911 }
912 case 7: {
913 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
914 break;
915 }
916 default: {
917 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
918 }
919 }
920 break;
921 }
922
923 case 2: {
924 const int idxX =
925 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
926 const int idxY =
927 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
928 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
929 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
930
931 const int numX = dimensions()[m_indices[idxX]];
932 const int numY = dimensions()[m_indices[idxY]];
933 const int numP = dimensions().TotalSize() / (numX*numY);
934
935 const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
936
937 // Snap maxX to warp size
938 int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
939 const int maxX = numext::mini<int>(inner_dim, numX);
940 const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
941 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
942
943 dim3 block_size;
944 block_size.x = numext::mini(1024, maxX);
945 block_size.y = numext::mini<int>(1024/block_size.x, maxY);
946 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
947
948 const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
949 assert(shared_mem <= maxSharedMem);
950
951 const int num_x_blocks = ceil(numX, maxX);
952 const int num_y_blocks = ceil(numY, maxY);
953 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
954 const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
955
956 dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
957
958
959 //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
960
961 const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
962 const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
963 m_kernelImpl.dimensions()[idxY]);
964 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
965 m_inputImpl.dimensions(), kernel_dims, indices);
966 switch (kernel_size_x) {
967 case 4: {
968 switch (kernel_size_y) {
969 case 7: {
970 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
971 break;
972 }
973 default: {
974 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
975 break;
976 }
977 }
978 break;
979 }
980 case 7: {
981 switch (kernel_size_y) {
982 case 4: {
983 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
984 break;
985 }
986 default: {
987 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
988 break;
989 }
990 }
991 break;
992 }
993 default: {
994 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
995 break;
996 }
997 }
998 break;
999 }
1000
1001 case 3: {
1002 const int idxX =
1003 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1004 const int idxY =
1005 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1006 const int idxZ =
1007 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1008
1009 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1010 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1011 const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1012
1013 const int numX = dimensions()[m_indices[idxX]];
1014 const int numY = dimensions()[m_indices[idxY]];
1015 const int numZ = dimensions()[m_indices[idxZ]];
1016 const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1017
1018 const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1019 const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1020 const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1021
1022 dim3 block_size;
1023 block_size.x = numext::mini(32, maxX);
1024 block_size.y = numext::mini(32, maxY);
1025 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1026 dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1027
1028 const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1029 assert(shared_mem <= maxSharedMem);
1030
1031 //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1032 const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1033 m_indices[idxZ]);
1034 const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1035 m_kernelImpl.dimensions()[idxY],
1036 m_kernelImpl.dimensions()[idxZ]);
1037 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1038 m_inputImpl.dimensions(), kernel_dims, indices);
1039
1040 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1041 break;
1042 }
1043
1044 default: {
1045 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1046 }
1047 }
1048 }
1049
1050 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1051 {
1052 eigen_assert(m_buf);
1053 eigen_assert(index < m_dimensions.TotalSize());
1054 return m_buf[index];
1055 }
1056
1057 template<int LoadMode>
1058 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1059 {
1060 eigen_assert(m_buf);
1061 eigen_assert(index < m_dimensions.TotalSize());
1062 return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1063 }
1064
1065 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1066 costPerCoeff(bool vectorized) const {
1067 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1068 // model.
1069 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1070 // We ignore the use of fused multiply-add.
1071 const double convolve_compute_cost =
1072 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1073 const double firstIndex_compute_cost =
1074 NumDims *
1075 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1076 TensorOpCost::DivCost<Index>());
1077 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1078 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1079 m_kernelImpl.costPerCoeff(vectorized) +
1080 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1081 PacketSize));
1082 }
1083
1084 private:
1085 // No assignment (copies are needed by the kernels)
1086 TensorEvaluator& operator = (const TensorEvaluator&);
1087
1088 TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1089 TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1090 KernelArgType m_kernelArg;
1091 Indices m_indices;
1092 Dimensions m_dimensions;
1093 Scalar* m_buf;
1094 const Scalar* m_kernel;
1095 bool m_local_kernel;
1096
1097 const GpuDevice& m_device;
1098};
1099#endif
1100
1101
1102} // end namespace Eigen
1103
1104#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H