TensorExecutor.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

/** \class TensorExecutor
  * \ingroup CXX11_Tensor_Module
  *
  * \brief The tensor executor class.
  */
namespace internal {

// Default strategy: the expression is evaluated with a single CPU thread.
template<typename Expression, typename Device, bool Vectorizable>
class TensorExecutor
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const Device& device = Device())
  {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};

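// A minimal usage sketch (hypothetical names): given an assignment expression
// `assign`, e.g. the TensorAssignOp built for `out.device(dev) = a + b;`, the
// evaluation is launched with
//
//   internal::TensorExecutor<const decltype(assign), DefaultDevice, false>::run(assign);
//
// In practice the tensor assignment operators construct this expression and
// call TensorExecutor::run themselves, picking the device type and the
// Vectorizable flag from the expression's traits.

// Vectorized path on the default (single-threaded CPU) device: coefficients
// are evaluated packet by packet, with the main loop unrolled by a factor of 4.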
template<typename Expression>
class TensorExecutor<Expression, DefaultDevice, true>
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
  {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
      // unroll the loop at the expense of inlining.
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
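      // Worked example (assuming PacketSize == 8, e.g. AVX floats): for
      // size == 100, UnrolledSize == 96 and VectorizedSize == 96, so
      // coefficients 0..95 are evaluated by the unrolled packet loop below
      // and 96..99 fall through to the scalar loop.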
      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};



// Multicore strategy: the index space is partitioned and each partition is executed on a single core
#ifdef EIGEN_USE_THREADS
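// EvalRange evaluates the coefficients in the half-open range [first, last) on
// the calling thread; the vectorizable specialization below additionally uses
// packet loads/stores and 4x unrolling, falling back to scalar evaluation for
// the tail of the range.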
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    return size;
  }
};

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    Index i = first;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index last_chunk_offset = last - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
      // unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
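    // Worked example (assuming PacketSize == 8): size == 1000 rounds up to
    // 1024, the next multiple of 4 * PacketSize == 32, while size == 100
    // (< 16 * PacketSize) rounds up only to the next multiple of PacketSize,
    // i.e. 104.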
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
  {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
                         [&evaluator](Index first, Index last) {
                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
                         });
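      // parallelFor uses the per-coefficient cost estimate to size work blocks
      // (rounded with alignBlockSize) and dispatches [first, last) ranges onto
      // the device's thread pool.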
#else
      size_t num_threads = device.numThreads();
      if (num_threads > 1) {
        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
            size, evaluator.costPerCoeff(Vectorizable), num_threads);
      }
      if (num_threads == 1) {
        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
      } else {
        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
        const Index numblocks = size / blocksize;
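        // Worked example (assumed figures: size == 1000, num_threads == 4,
        // PacketSize == 8): blocksz == 250 + 7 == 257, blocksize == 256,
        // numblocks == 3; the 232 leftover coefficients are evaluated on the
        // calling thread below before waiting on the barrier.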

        Barrier barrier(numblocks);
        for (int i = 0; i < numblocks; ++i) {
          device.enqueue_with_barrier(
              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
              &evaluator, i * blocksize, (i + 1) * blocksize);
        }
        if (numblocks * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(
              &evaluator, numblocks * blocksize, size);
        }
        barrier.Wait();
      }
#endif // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
    }
    evaluator.cleanup();
  }
};
#endif // EIGEN_USE_THREADS


// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};


#if defined(__CUDACC__)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;
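    // Each thread consumes whole packets: thread `first` starts at coefficient
    // first * PacketSize and advances by step_size packets per iteration; any
    // coefficients beyond the last full packet are evaluated scalar-wise below.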

    // Use the vector path
    for (Index i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (Index i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, Index size) {

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;
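  // Grid-stride loop: every thread starts at its global index and strides by
  // the total number of launched threads, so any grid size covers the whole
  // index range.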

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable>
void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
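    // Worked example (assumed figures): on a device with 20 multiprocessors,
    // 2048 resident threads per multiprocessor and block_size == 1024,
    // max_blocks == 40; num_blocks below is further capped by the number of
    // blocks actually needed (divup(size, block_size)) and floored at 1.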
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif // __CUDACC__
#endif // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
 public:
  static inline void run(const Expression &expr, const SyclDevice &device) {
    // call TensorSYCL module
    TensorSycl::run(expr, device);
  }
};

#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H