TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  Specialisations of the tensor reduction evaluators (full and
 *  partial reductions) for the SYCL backend.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

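// Generic tree reduction over a SYCL buffer: each pass launches a kernel in
// which every work-group reduces its tile in local memory and writes a single
// partial sum back; passes repeat until one value remains and is written to
// bufOut.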
template <typename CoeffReturnType, typename KernelName>
struct syclGenericBufferReducer {
  template <typename BufferTOut, typename BufferTIn>
  static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev,
                  size_t length, size_t local) {
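    // Each iteration reduces `length` values to `length / local` per-group
    // partial sums, written back into bufI; the loop repeats until a single
    // value is left.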
    do {
      auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
        cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                                cl::sycl::range<1>{std::min(length, local)}};
        /* Two accessors are used: one to the buffer that is being reduced,
         * and a second to local memory, used to store intermediate data. */
        auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h);
        auto aOut = bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
        cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                           cl::sycl::access::target::local>
            scratch(cl::sycl::range<1>(local), h);

        /* The parallel_for invocation chosen is the variant with an nd_item
         * parameter, since the code requires barriers for correctness. */
        h.parallel_for<KernelName>(
            r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
              size_t globalid = id.get_global(0);
              size_t localid = id.get_local(0);
              /* All threads collectively read from global memory into local.
               * The barrier ensures all threads' IO is resolved before
               * execution continues (strictly speaking, all threads within
               * a single work-group - there is no co-ordination between
               * work-groups, only work-items). */
              if (globalid < length) {
                scratch[localid] = aI[globalid];
              }
              id.barrier(cl::sycl::access::fence_space::local_space);

              /* Apply the reduction operation between the current local
               * id and the one on the other half of the vector. */
              if (globalid < length) {
                size_t min = (length < local) ? length : local;
                for (size_t offset = min / 2; offset > 0; offset /= 2) {
                  if (localid < offset) {
                    scratch[localid] += scratch[localid + offset];
                  }
                  id.barrier(cl::sycl::access::fence_space::local_space);
                }
                /* The final result will be stored in local id 0. */
                if (localid == 0) {
                  aI[id.get_group(0)] = scratch[localid];
                  if ((length <= local) && globalid == 0) {
                    aOut[globalid] = scratch[localid];
                  }
                }
              }
            });
      };
      dev.m_queue.submit(f);
      dev.m_queue.throw_asynchronous();

      /* At this point, you could queue::wait_and_throw() to ensure that
       * errors are caught quickly. However, this would likely impact
       * performance negatively. */
      length = length / local;
    } while (length > 1);
  }
};

// Full reduction on the SYCL device. In expression construction the reduction
// is treated as a leaf node, so the full reducer is applied to the reduction's
// child expression (which is itself not a leafNode).
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr;
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256;
    size_t inputSize = self.impl().dimensions().TotalSize();
    // Each work-item first reduces red_factor consecutive inputs serially, so
    // rng work-items cover the whole input.
    size_t rng = inputSize / red_factor;
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
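    // Two-phase strategy: the kernel below writes GRange partial results into
    // a temporary buffer; syclGenericBufferReducer then collapses them into
    // the single output value.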
    // Use half of the device's maximum work-group size as the tile size.
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // Round the global range up to the next power of two for the reduction.
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
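    // The bit-smearing above sets every bit below the highest set bit, so the
    // final increment yields the next power of two; e.g. rng = 300 gives
    // 299 = 0b100101011 -> 0b111111111 = 511 -> 512.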
    size_t outTileSize = tileSize;
    if (GRange < outTileSize) outTileSize = GRange;
    // Wrap the user-supplied output pointer in a SYCL buffer for the final
    // result, and create a temporary buffer for the per-group partial sums.
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
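    // Copy the reduction dimensions and the functor by value so the device
    // lambda can capture them; the host-side evaluator itself cannot be used
    // inside the kernel.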
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // Create a tuple of accessors from the evaluator.
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        // Fold in the tail of the input when its size is not divisible by
        // red_factor.
        if (remaining != 0 && globalid == 0)
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

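    // Second phase: collapse the GRange partial results in temp_global_buffer
    // down to the single reduced value in out_buffer.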
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }
};
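
// Usage sketch (illustrative only, not part of this file): a reduction over
// every dimension, e.g.
//   out_gpu.device(sycl_device) = in_gpu.sum();
// where out_gpu is a TensorMap of a rank-0 tensor over device memory and
// sycl_device is an Eigen::SyclDevice, dispatches to FullReducer above.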

// Partial reduction on the SYCL device: the reduced dimensions are collapsed
// while num_coeffs_to_preserve output coefficients are kept; one work-item
// computes each preserved coefficient.
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index /*num_coeffs_to_reduce*/, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr;
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) {
      tileSize = GRange;
    } else {
      // Round GRange up to the next multiple of tileSize so that every
      // work-group is fully populated.
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
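    // e.g. num_coeffs_to_preserve = 1000 with tileSize = 128 gives
    // xMode = 1000 % 128 = 104, so GRange = 1000 + (128 - 104) = 1024.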
    // Copy the reduction dimensions and the functor by value for capture by
    // the device lambda.
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // Create a tuple of accessors from the evaluator.
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
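
// Usage sketch (illustrative only, not part of this file): a reduction over a
// subset of dimensions, e.g.
//   Eigen::array<int, 1> red_axis{{0}};
//   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
// where in_gpu/out_gpu are TensorMaps over device memory, dispatches to
// InnerReducer above with one work-item per preserved output coefficient.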

} // end namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP