TensorScan.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScanOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};
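
// Illustrative usage sketch (not part of this header): a TensorScanOp is
// normally built via the TensorBase helpers such as cumsum()/cumprod()
// rather than constructed directly, e.g.:
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 2> inclusive = t.cumsum(0);        // running sums along axis 0
//   Eigen::Tensor<float, 2> exclusive = t.cumsum(0, true);  // shifted by one; starts at the
//                                                           // reducer's identity (0 for sum)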

template <typename Self, typename Reducer, typename Device>
struct ScanLauncher;

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;

  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                        const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
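    // (Editorial note: for ColMajor the stride is the product of the
    //  dimensions preceding the axis; for RowMajor, of those following it.
    //  E.g., a ColMajor tensor with dims (2, 3, 4) and axis == 1 gives
    //  m_stride == 2, so consecutive scan elements are 2 coefficients apart.)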
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      for (int i = NumDims - 1; i > op.axis(); --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
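    // If the caller supplies a destination buffer, scan directly into it and
    // return false (nothing left to evaluate); otherwise allocate a temporary
    // buffer, fill it, and return true so that cleanup() releases it later.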
    m_impl.evalSubExprsIfNeeded(NULL);
    ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_output != NULL) {
      m_device.deallocate(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device& m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  CoeffReturnType* m_output;
};

// CPU implementation of scan
// TODO(ibab) This single-threaded implementation should be parallelized,
// at least by running multiple scans at the same time.
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType *data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a scan per
    // remaining entry. The iteration is split into two nested loops to avoid
    // an integer division, by keeping track of idx1 and idx2 separately.
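    // (Illustrative walkthrough, not in the original source: with stride == 2
    // and size == 3, idx1 steps by 6 across blocks of scan lines, idx2 picks
    // one of the 2 interleaved lines in a block, and the inner idx3 loop
    // below visits that line's elements at offset, offset + 2, offset + 4.)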
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
        // Calculate the starting offset for the scan
        Index offset = idx1 + idx2;

        // Compute the scan along the axis, starting at the calculated offset
        typename Self::CoeffReturnType accum = self.accumulator().initialize();
        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
          Index curr = offset + idx3 * self.stride();

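          // An exclusive scan stores the running total *before* folding in
          // the current element; an inclusive scan folds first, then stores.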
          if (self.exclusive()) {
            data[curr] = self.accumulator().finalize(accum);
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
          } else {
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
            data[curr] = self.accumulator().finalize(accum);
          }
        }
      }
    }
  }
};

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
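  // (Each thread owns one scan line. E.g., with stride == 2 and size == 3,
  //  thread val == 5 starts at offset (5 / 2) * 2 * 3 + 5 % 2 == 13.)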

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();

}

template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;
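    // One thread per scan line (total_size / size() lines in all), launched
    // in blocks of 64 threads; out-of-range threads are filtered by the
    // bounds check in the kernel above.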
    LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif // EIGEN_USE_GPU && __CUDACC__

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H