Remove unused code.
[piranha:mainline.git] / src / core / polynomial_common / polynomial_multiplier.h
1 /***************************************************************************
2  *   Copyright (C) 2007, 2008 by Francesco Biscani   *
3  *   bluescarni@gmail.com   *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20
21 #ifndef PIRANHA_POLYNOMIAL_MULTIPLIER_H
22 #define PIRANHA_POLYNOMIAL_MULTIPLIER_H
23
24 #include <algorithm> // For std::max.
25 #include <boost/integer_traits.hpp>
26 #include <boost/iterator/permutation_iterator.hpp>
27 #include <boost/lambda/lambda.hpp>
28 #include <boost/numeric/conversion/cast.hpp>
29 #include <boost/numeric/interval.hpp>
30 #include <boost/thread/barrier.hpp>
31 #include <boost/thread/thread.hpp>
32 #include <boost/tuple/tuple.hpp>
33 #include <boost/type_traits/integral_constant.hpp>
34 #include <cstddef>
35 #include <exception>
36 #include <stdexcept>
37 #include <utility> // For std::pair.
38 #include <vector>
39
40 #include "../base_classes/base_series_multiplier.h"
41 #include "../base_classes/coded_multiplier.h"
42 #include "../coded_hash_table.h"
43 #include "../exceptions.h"
44 #include "../integer_typedefs.h"
45 #include "../memory.h"
46 #include "../settings.h" // For debug and cache size.
47 #include "../stats.h"
48 #include "../utils.h" // For iota.
49
50 namespace piranha
51 {
52         typedef std::pair<std::size_t,std::size_t> block_type;
53         typedef std::vector<block_type> block_sequence;
54
55         // Generic threaded vector multiplication.
56         template <class Functor>
57         struct threaded_blocked_multiplier
58         {
59                 threaded_blocked_multiplier(const std::size_t &nominal_block_size, const std::size_t &size1, const std::size_t &size2, const std::size_t &thread_id,
60                         const std::size_t &thread_n, boost::barrier *barrier, std::size_t &cur_idx1_start, bool &breakout, Functor &func,
61                         block_sequence &idx_vector1, block_sequence &idx_vector2):
62                         m_nominal_block_size(nominal_block_size),m_size1(size1),m_thread_id(thread_id),m_thread_n(thread_n),
63                         m_barrier(barrier),m_cur_idx1_start(cur_idx1_start),m_breakout(breakout),m_func(func),m_idx_vector1(idx_vector1),m_idx_vector2(idx_vector2)
64                 {
65                         // Sanity checks.
66                         piranha_assert(thread_n > 0 && thread_id < thread_n && (barrier || thread_n == 1));
67                         // Numerical limits check. We need an extra block size buffer at the end to make sure we are able to
68                         // represent all indices and sizes.
69                         // TODO: we need to take care of extra thread_n blocks at the end, they must be representable. Maybe not, after all.
70                         if (nominal_block_size > boost::integer_traits<std::size_t>::const_max || size1 >= boost::integer_traits<std::size_t>::const_max - nominal_block_size ||
71                                 size2 >= boost::integer_traits<std::size_t>::const_max - nominal_block_size)
72                         {
73                                 piranha_throw(std::overflow_error,"numerical overflow in threaded block multiplication");
74                         }
75                 }
76                 void operator()()
77                 {
78                         if (m_thread_id == 0) {
79                                 // The first thread is in charge of the initial setup of the indices vectors.
80                                 // TODO: exception handling, in case of both single and multi thread.
81                                 m_idx_vector1.resize(boost::numeric_cast<block_sequence::size_type>(m_thread_n));
82                                 m_idx_vector2.resize(boost::numeric_cast<block_sequence::size_type>(m_thread_n));
83                                 m_cur_idx1_start = 0;
84                         }
85                         sync();
86                         block_sequence orig2(m_idx_vector2.size());
87                         while (m_cur_idx1_start != m_size1) {
88                                 sync();
89                                 if (m_thread_id == 0) {
90                                         m_func.blocks_setup(m_cur_idx1_start,m_nominal_block_size,m_idx_vector1,m_idx_vector2);
91                                         m_breakout = false;
92                                 }
93                                 sync();
94                                 const std::size_t i_start = m_idx_vector1[m_thread_id].first, i_end = m_idx_vector1[m_thread_id].second;
95                                 // Remember the original block sequence for the second series.
96                                 std::copy(m_idx_vector2.begin(),m_idx_vector2.end(),orig2.begin());
97                                 // Reset the wrap count.
98                                 std::size_t wrap_count = 0;
99                                 while (true) {
100                                         const std::size_t j_start = m_idx_vector2[m_thread_id].first, j_end = m_idx_vector2[m_thread_id].second;
101                                         // NOTE: here maybe we can put a preemptive check on j start/end so that if the inner
102                                         // cycle is empty we skip this part altogether.
103                                         for (std::size_t i = i_start; i < i_end; ++i) {
104                                                 for (std::size_t j = j_start; j < j_end; ++j) {
105                                                         // TODO: truncation.
106                                                         m_func(i,j);
107
108                                                 }
109                                         }
110                                         sync();
111                                         if (m_thread_id == 0) {
112                                                 if (!m_func.block2_advance(m_idx_vector1,m_idx_vector2,m_nominal_block_size,orig2,wrap_count)) {
113                                                         m_breakout = true;
114                                                 }
115                                         }
116                                         sync();
117                                         if (m_breakout) {
118                                                 break;
119                                         }
120                                 }
121                         }
122                 }
123                 void sync()
124                 {
125                         if (m_barrier) {
126                                 m_barrier->wait();
127                         }
128                 }
129                 const std::size_t       m_nominal_block_size;
130                 const std::size_t       m_size1;
131                 const std::size_t       m_thread_id;
132                 const std::size_t       m_thread_n;
133                 boost::barrier          *m_barrier;
134                 std::size_t             &m_cur_idx1_start;
135                 bool                    &m_breakout;
136                 Functor                 &m_func;
137                 block_sequence          &m_idx_vector1;
138                 block_sequence          &m_idx_vector2;
139         };
140
141         /// Series multiplier specifically tuned for polynomials.
142         /**
143          * This multiplier internally will use coded arithmetics if possible, otherwise it will operate just
144          * like piranha::base_series_multiplier.
145          */
146         class polynomial_multiplier
147         {
148                 public:
149                         template <class Series1, class Series2, class ArgsTuple, class Truncator>
150                         class get_type:
151                                 public base_series_multiplier< Series1, Series2, ArgsTuple, Truncator,
152                                         get_type<Series1, Series2, ArgsTuple, Truncator> >,
153                                 public coded_multiplier<get_type<Series1, Series2, ArgsTuple, Truncator>,Series1,Series2,boost::tuple<boost::true_type> >
154                         {
155                                         typedef base_series_multiplier< Series1, Series2, ArgsTuple, Truncator,
156                                                 get_type<Series1, Series2, ArgsTuple, Truncator> > ancestor;
157                                         typedef coded_multiplier<get_type<Series1, Series2, ArgsTuple, Truncator>,Series1,Series2,boost::tuple<boost::true_type> > coded_ancestor;
158                                         friend class coded_multiplier<get_type<Series1, Series2, ArgsTuple, Truncator>,Series1,Series2,boost::tuple<boost::true_type> >;
159                                         typedef typename ancestor::term_type1 term_type1;
160                                         typedef typename ancestor::term_type2 term_type2;
161                                         typedef typename term_type1::cf_type cf_type1;
162                                         typedef typename term_type2::cf_type cf_type2;
163                                 public:
164                                         typedef Series1 series_type1;
165                                         typedef Series2 series_type2;
166                                         typedef ArgsTuple args_tuple_type;
167                                         typedef typename Truncator::template get_type<Series1,Series2,ArgsTuple> truncator_type;
168                                         get_type(const Series1 &s1, const Series2 &s2, Series1 &retval, const ArgsTuple &args_tuple):
169                                                 ancestor(s1, s2, retval, args_tuple) {}
170                                         template <class Functor>
171                                         struct indirect_sorter
172                                         {
173                                                 indirect_sorter(const Functor &func, const std::vector<max_fast_int> &v):m_func(func),m_v(v) {}
174                                                 bool operator()(const std::size_t &n1, const std::size_t &n2) const
175                                                 {
176                                                         // TODO numeric casts here, or maybe one large check at the beginning of the coded multiplier?
177                                                         return m_func.get_mem_pos(m_v[n1]) < m_func.get_mem_pos(m_v[n2]);
178                                                 }
179                                                 const Functor                   &m_func;
180                                                 const std::vector<max_fast_int> &m_v;
181                                         };
182                                         template <class GenericTruncator>
183                                         struct vector_functor {
184                                                 typedef boost::numeric::interval<max_fast_int,boost::numeric::interval_lib::policies<
185                                                         boost::numeric::interval_lib::rounded_math<max_fast_int>,
186                                                         boost::numeric::interval_lib::checking_base<max_fast_int>
187                                                 > > interval;
188                                                 vector_functor(std::vector<cf_type1> &tc1, std::vector<cf_type2> &tc2,
189                                                         std::vector<max_fast_int> &ck1, std::vector<max_fast_int> &ck2,
190                                                         std::vector<term_type1 const *> &t1, std::vector<term_type2 const *> &t2,
191                                                         const GenericTruncator &trunc, cf_type1 *vc_res, const ArgsTuple &args_tuple):
192                                                         m_tc1(tc1),m_tc2(tc2),m_ck1(ck1),m_ck2(ck2),m_t1(t1),m_t2(t2),m_trunc(trunc),
193                                                         m_vc_res(vc_res),m_args_tuple(args_tuple) {}
194                                                 bool operator()(const std::size_t &i, const std::size_t &j)
195                                                 {
196                                                         if (m_trunc.skip(&m_t1[i], &m_t2[j])) {
197                                                                 return false;
198                                                         }
199                                                         // Calculate index of the result.
200                                                         const max_fast_int res_index = m_ck1[i] + m_ck2[j];
201                                                         m_vc_res[res_index].addmul(m_tc1[i],m_tc2[j],m_args_tuple);
202                                                         return true;
203                                                 }
204                                                 void blocks_setup(std::size_t &cur_idx1_start, const std::size_t &block_size,
205                                                         block_sequence &idx_vector1, block_sequence &idx_vector2)
206                                                 {
207                                                         piranha_assert(cur_idx1_start < m_tc1.size());
208                                                         // Only the initial sorting is needed, for vector coded.
209                                                         if (cur_idx1_start == 0) {
210                                                                 initial_setup();
211                                                         }
212                                                         typedef block_sequence::size_type size_type;
213                                                         // Tentatively divide into homogeneous blocks.
214                                                         for (size_type i = 0; i < idx_vector1.size(); ++i) {
215                                                                 idx_vector1[i].first = cur_idx1_start + i * block_size;
216                                                                 idx_vector1[i].second = idx_vector1[i].first + block_size;
217                                                         }
218                                                         for (size_type i = 0; i < idx_vector2.size(); ++i) {
219                                                                 idx_vector2[i].first = i * block_size;
220                                                                 idx_vector2[i].second = idx_vector2[i].first + block_size;
221                                                         }
222                                                         // Now we must check the blocks for the following conditions:
223                                                         // 1 - we must not be past the end of the series.
224                                                         // 2 - the macroblocks must not result in overlapping areas in the output structure.
225                                                         // 3 - the upper bound of each block must be different from the lower bound of next block.
226                                                         // ---
227                                                         // 1 - If we are past the end of the series, reduce the block sizes.
228                                                         reduce_macroblock(idx_vector1,m_tc1,cur_idx1_start);
229                                                         reduce_macroblock(idx_vector2,m_tc2,0);
230                                                         // 2 - Check macroblocks mult results do not overlap.
231                                                         adjust_overlapping(idx_vector1,idx_vector2,m_ck1,m_ck2);
232                                                         // 3 - Blocks boundaries check.
233                                                         adjust_block_boundaries(idx_vector1,idx_vector2,m_ck1,m_ck2);
234                                                         // Finally, update the cur_idx1.
235                                                         cur_idx1_start = idx_vector1.back().second;
236 // std::cout << "init\n";
237 // for (std::size_t i = 0; i < idx_vector1.size(); ++i) {
238 //      std::cout << idx_vector1[i].first << ',' << idx_vector1[i].second << '\n';
239 // }
240 // for (std::size_t i = 0; i < idx_vector2.size(); ++i) {
241 //      std::cout << idx_vector2[i].first << ',' << idx_vector2[i].second << '\n';
242 // }
243 // std::cout << "blah\n";
244                                                 }
245                                                 static void adjust_overlapping(block_sequence &,block_sequence &,
246                                                         const std::vector<max_fast_int> &, const std::vector<max_fast_int> &)
247                                                 {
248                                                         // TODO: to be done for hash coded.
249                                                 }
250                                                 static void adjust_block_boundaries(block_sequence &,block_sequence &,
251                                                         const std::vector<max_fast_int> &, const std::vector<max_fast_int> &)
252                                                 {
253                                                         // TODO: to be done for hash coded.
254                                                 }
255                                                 bool block2_advance(const block_sequence &idx_vector1, block_sequence &idx_vector2,
256                                                         const std::size_t &block_size, const block_sequence &orig2, std::size_t &wrap_count) const
257                                                 {
258                                                         piranha_assert(idx_vector1.size() == idx_vector2.size() && idx_vector1.size() > 0);
259                                                         if (wrap_count) {
260                                                                 piranha_assert(wrap_count < idx_vector2.size());
261                                                                 if (wrap_count == idx_vector2.size() - 1) {
262                                                                         // This means we are at the end.
263                                                                         return false;
264                                                                 }
265                                                                 // Shift down the blocks.
266                                                                 std::copy(idx_vector2.begin() + 1,idx_vector2.end(),idx_vector2.begin());
267                                                                 // Get the new block from the originals.
268                                                                 idx_vector2.back() = orig2[wrap_count];
269                                                                 // Increase the wrap count.
270                                                                 ++wrap_count;
271                                                         } else {
272                                                                 // Shift down the blocks.
273                                                                 std::copy(idx_vector2.begin() + 1,idx_vector2.end(),idx_vector2.begin());
274                                                                 // Set the new starting point for the last block.
275                                                                 idx_vector2.back().first = idx_vector2.back().second;
276                                                                 // Add the block size or stop at the end of the series, if necessary.
277                                                                 idx_vector2.back().second = std::min<std::size_t>(m_tc2.size(),idx_vector2.back().first + block_size);
278                                                                 // Now check if we are at the end of the first phase.
279                                                                 if (idx_vector2.front() == block_type(m_tc2.size(),m_tc2.size())) {
280                                                                         if (idx_vector2.size() > 1) {
281                                                                                 // If multi-threaded, insert at the end the first original block.
282                                                                                 idx_vector2.back() = orig2.front();
283                                                                                 // Start the wrap count.
284                                                                                 wrap_count = 1;
285                                                                         } else {
286                                                                                 // In single-threaded, this means we have finished.
287                                                                                 return false;
288                                                                         }
289                                                                 } else if (idx_vector2.size() > 1) {
290                                                                         // If we are not at the end of the first phase and we are multithreaded, we need to make sure the newly-added
291                                                                         // block does not overlap.
292                                                                         // NOTE: maybe this function can be replaced by direct check that the last block of first series
293                                                                         // by the newly added block in second series do not overlap with the remaining macroblock 1 by remaining
294                                                                         // macro block 2.
295                                                                         while (sequences_overlap(idx_vector1,idx_vector2)) {
296                                                                                 piranha_assert(idx_vector2.back().second >= idx_vector2.back().first);
297                                                                                 idx_vector2.back().second = idx_vector2.back().first +
298                                                                                         (idx_vector2.back().second - idx_vector2.back().first) / 2;
299                                                                         }
300                                                                 }
301                                                         }
302                                                         // Make sure we have no overlaps.
303                                                         piranha_assert(!sequences_overlap(idx_vector1,idx_vector2));
304 // std::cout << "after advance\n";
305 // for (std::size_t i = 0; i < idx_vector1.size(); ++i) {
306 //      std::cout << idx_vector1[i].first << ',' << idx_vector1[i].second << '\n';
307 // }
308 // for (std::size_t i = 0; i < idx_vector2.size(); ++i) {
309 //      std::cout << idx_vector2[i].first << ',' << idx_vector2[i].second << '\n';
310 // }
311 // std::cout << "blappo\n";
312                                                         return true;
313                                                 }
314                                                 static bool interval_sorter(const interval &i1, const interval &i2)
315                                                 {
316                                                         return i1.lower() < i2.lower();
317                                                 }
318                                                 bool sequences_overlap(const block_sequence &s1, const block_sequence &s2) const
319                                                 {
320                                                         piranha_assert(s1.size() == s2.size() && s1.size() > 0);
321                                                         typedef std::vector<interval>::size_type size_type;
322                                                         std::vector<interval> vi;
323                                                         for (size_type i = 0; i < s1.size(); ++i) {
324                                                                 std::pair<interval,interval> tmp(blocks_to_intervals(s1[i],s2[i]));
325                                                                 if (!boost::numeric::empty(tmp.first)) {
326                                                                         vi.push_back(tmp.first);
327                                                                 }
328                                                                 if (!boost::numeric::empty(tmp.second)) {
329                                                                         vi.push_back(tmp.second);
330                                                                 }
331                                                         }
332                                                         if (!vi.size()) {
333                                                                 return false;
334                                                         }
335                                                         // Sort according to lower bound of the interval.
336                                                         std::sort(vi.begin(),vi.end(),interval_sorter);
337                                                         piranha_assert(vi.size() > 0);
338                                                         // Check that all intervals are disjoint.
339                                                         for (std::vector<interval>::size_type i = 0; i < vi.size() - 1; ++i) {
340                                                                 if (vi[i].upper() >= vi[i + 1].lower()) {
341                                                                         return true;
342                                                                 }
343                                                         }
344                                                         return false;
345                                                 }
346                                                 const max_fast_int &get_mem_pos(const max_fast_int &n) const
347                                                 {
348                                                         return n;
349                                                 }
350                                                 // Return the two intervals in indices in the output structure containing the results of
351                                                 // one block-by-block multiplication.
352                                                 std::pair<interval,interval> blocks_to_intervals(const block_type &b1, const block_type &b2) const
353                                                 {
354                                                         piranha_assert(b1.first <= b1.second && b2.first <= b2.second);
355                                                         // If at least one of the blocks is empty, then we won't be writing into any interval of indices.
356                                                         if (b1.first == b1.second || b2.first == b2.second) {
357                                                                 return std::make_pair(interval::empty(),interval::empty());
358                                                         }
359                                                         // In case of vector coded, we always end up with a single interval in output.
360                                                         return std::make_pair(interval(m_ck1[b1.first] + m_ck2[b2.first],m_ck1[b1.second - 1] + m_ck2[b2.second - 1]),
361                                                                 interval::empty());
362                                                 }
363                                                 template <class Vector>
364                                                 static void reduce_macroblock(block_sequence &idx_vector, const Vector &tc, const std::size_t &cur_idx_start)
365                                                 {
366                                                         typedef block_sequence::size_type size_type;
367                                                         if (idx_vector.back().second > tc.size()) {
368                                                                 if (tc.size() - cur_idx_start >= idx_vector.size()) {
369                                                                         // If the number of remainder terms is at least equal to the number of threads,
370                                                                         // let's break it down in (almost) equal parts.
371                                                                         const std::size_t new_block_size = (tc.size() - cur_idx_start) / idx_vector.size();
372                                                                         size_type i = 0;
373                                                                         for (; i < idx_vector.size() - 1; ++i) {
374                                                                                 idx_vector[i].first = cur_idx_start + i * new_block_size;
375                                                                                 idx_vector[i].second = idx_vector[i].first + new_block_size;
376                                                                         }
377                                                                         // Last block might be inhomogeneous, handle it separately.
378                                                                         idx_vector.back().first = cur_idx_start + i * new_block_size;
379                                                                         idx_vector.back().second = tc.size();
380                                                                 } else {
381                                                                         // If the number of remainder terms r is less than the number of threads,
382                                                                         // assign each of the first r terms to a single thread and collapse the remaining blocks.
383                                                                         size_type i = 0;
384                                                                         for (; i < tc.size() - cur_idx_start; ++i) {
385                                                                                 idx_vector[i].first = cur_idx_start + i;
386                                                                                 idx_vector[i].second = idx_vector[i].first + 1;
387                                                                         }
388                                                                         for (; i < idx_vector.size(); ++i) {
389                                                                                 idx_vector[i].first = tc.size();
390                                                                                 idx_vector[i].second = tc.size();
391                                                                         }
392                                                                 }
393                                                         }
394                                                 }
395                                                 void initial_setup()
396                                                 {
397                                                         // Build the permutation vectors.
398                                                         typedef std::vector<std::size_t>::size_type size_type;
399                                                         std::vector<std::size_t> perm1(boost::numeric_cast<size_type>(m_ck1.size())), perm2(boost::numeric_cast<size_type>(m_ck2.size()));
400                                                         iota(perm1.begin(),perm1.end(),std::size_t(0));
401                                                         iota(perm2.begin(),perm2.end(),std::size_t(0));
402                                                         // Sort the permutation vectors.
403                                                         std::sort(perm1.begin(),perm1.end(),indirect_sorter<vector_functor>(*this,m_ck1));
404                                                         std::sort(perm2.begin(),perm2.end(),indirect_sorter<vector_functor>(*this,m_ck2));
405                                                         // Apply the permutations to the other vectors.
406                                                         apply_permutation(perm1,m_tc1);
407                                                         apply_permutation(perm1,m_ck1);
408                                                         apply_permutation(perm1,m_t1);
409                                                         apply_permutation(perm2,m_tc2);
410                                                         apply_permutation(perm2,m_ck2);
411                                                         apply_permutation(perm2,m_t2);
412                                         }
413                                                 // TODO: rewrite with iterators for genericity? Or maybe provide alternative version.
414                                                 template <class T>
415                                                 static void apply_permutation(const std::vector<std::size_t> &perm, std::vector<T> &v)
416                                                 {
417                                                         typedef boost::permutation_iterator<typename std::vector<T>::iterator,std::vector<std::size_t>::const_iterator> perm_iterator;
418                                                         std::vector<T> other(v.size());
419                                                         std::copy(perm_iterator(v.begin(),perm.begin()),perm_iterator(v.end(),perm.end()),other.begin());
420                                                         other.swap(v);
421                                                 }
422                                                 std::vector<cf_type1>           &m_tc1;
423                                                 std::vector<cf_type2>           &m_tc2;
424                                                 std::vector<max_fast_int>       &m_ck1;
425                                                 std::vector<max_fast_int>       &m_ck2;
426                                                 std::vector<term_type1 const *> &m_t1;
427                                                 std::vector<term_type2 const *> &m_t2;
428                                                 const GenericTruncator          &m_trunc;
429                                                 cf_type1                        *m_vc_res;
430                                                 const ArgsTuple                 &m_args_tuple;
431                                         };
432                                         template <class GenericTruncator>
433                                         bool perform_vector_coded_multiplication(std::vector<cf_type1> &tc1, std::vector<cf_type2> &tc2,
434                                                 std::vector<term_type1 const *> &t1, std::vector<term_type2 const *> &t2, const GenericTruncator &trunc)
435                                         {
436                                                 std::vector<cf_type1,std_counting_allocator<cf_type1> > vc;
437                                                 // Try to allocate the space for vector coded multiplication.
438                                                 // The +1 is needed because we need the number of possible codes between min and max.
439                                                 piranha_assert(boost::numeric::width(this->m_fast_h) + 1 >= 0);
440                                                 const std::size_t n_codes = boost::numeric_cast<std::size_t>(boost::numeric::width(this->m_fast_h) + 1);
441                                                 try {
442                                                         vc.resize(n_codes);
443                                                 } catch (const std::bad_alloc &) {
444                                                         __PDEBUG(std::cout << "Not enough physical memory available for vector coded.\n");
445                                                         return false;
446                                                 } catch (const memory_error &) {
447                                                         __PDEBUG(std::cout << "Memory limit reached for vector coded.\n");
448                                                         return false;
449                                                 }
450                                                 __PDEBUG(std::cout << "Going for vector coded polynomial multiplication\n");
451                                                 // Define the base pointers for storing the results of multiplication.
452                                                 // NOTE: even if here it seems like we are going to write outside allocated memory,
453                                                 //       the indices from the analysis of the coded series will prevent out-of-boundaries
454                                                 //       reads/writes. The thing works like this: we have ncodes slots allocated, so memory
455                                                 //       indices in [0,ncodes - 1]. But since we are doing arithmetics on shifted codes, the
456                                                 //       code range is [-h_min,ncodes - 1 - h_min]. So we shift the baseline memory location
457                                                 //       so that we can use shifted codes directly as indices.
458                                                 const std::size_t size1 = this->m_terms1.size(), size2 = this->m_terms2.size();
459 //std::cout << "sizes: " << size1 << ',' << size2 << '\n';
460                                                 piranha_assert(size1 && size2);
461                                                 const args_tuple_type &args_tuple = this->m_args_tuple;
462                                                 cf_type1 *vc_res =  &vc[0] - this->m_fast_h.lower();
463                                                 // Find out a suitable block size.
464                                                 const std::size_t block_size = this->template compute_block_size<sizeof(cf_type1)>();
465                                                 __PDEBUG(std::cout << "Block size: " << block_size << '\n');
466 // std::cout << "Block size: " << block_size << '\n';
467                                                 // Perform multiplication.
468                                                 typedef vector_functor<GenericTruncator> vf_type;
469                                                 vf_type vm(tc1,tc2,this->m_ckeys1,this->m_ckeys2a,t1,t2,trunc,vc_res,args_tuple);
470                                                 const std::size_t nthread = settings::get_nthread();
471 // const boost::posix_time::ptime time0 = boost::posix_time::microsec_clock::local_time();
472                                                 // Variables needed by the multiplier.
473                                                 block_sequence s1, s2;
474                                                 bool breakout = false;
475                                                 std::size_t cur_idx1_start = 0;
476                                                 // TODO: probably we need to rethink a bit this, taking into account also the number of threads. Also drop the truncation limitation.
477                                                 if (trunc.is_effective() || (this->m_terms1.size() * this->m_terms2.size()) <= 400 || nthread == 1) {
478                                                         stats::trace_stat("mult_st",std::size_t(0),boost::lambda::_1 + 1);
479                                                         threaded_blocked_multiplier<vf_type> t(block_size,size1,size2,0,1,0,cur_idx1_start,breakout,vm,s1,s2);
480                                                         t();
481                                                 } else {
482 // std::cout << "using " << nthread << " threads\n";
483                                                         stats::trace_stat("mult_mt",std::size_t(0),boost::lambda::_1 + 1);
484                                                         boost::thread_group tg;
485                                                         boost::barrier b(nthread);
486                                                         for (std::size_t i = 0; i < nthread; ++i) {
487                                                                 tg.create_thread(threaded_blocked_multiplier<vf_type>(block_size,size1,size2,i,nthread,&b,cur_idx1_start,breakout,vm,s1,s2));
488                                                         }
489                                                         tg.join_all();
490                                                 }
491 // std::cout << "Elapsed time: " << (double)(boost::posix_time::microsec_clock::local_time() - time0).total_microseconds() / 1000 << '\n';
492                                                 __PDEBUG(std::cout << "Done multiplying\n");
493                                                 const max_fast_int i_f = this->m_fast_h.upper();
494                                                 // Decode and insert the results into return value.
495                                                 term_type1 tmp_term;
496                                                 for (max_fast_int i = this->m_fast_h.lower(); i <= i_f; ++i) {
497                                                         // Take a shortcut and check for ignorability of the coefficient here.
498                                                         // This way we avoid decodification, and all the series term insertion yadda-yadda.
499                                                         if (!vc_res[i].is_ignorable(args_tuple)) {
500                                                                 this->decode(vc_res[i], i,tmp_term);
501                                                                 if (!tmp_term.is_canonical(args_tuple)) {
502                                                                         tmp_term.canonicalise(args_tuple);
503                                                                 }
504                                                                 this->m_retval.insert(tmp_term, args_tuple);
505                                                         }
506                                                 }
507                                                 __PDEBUG(std::cout << "Done polynomial vector coded.\n");
508                                                 return true;
509                                         }
510                                         template <class Cf, class Ckey, class GenericTruncator, class HashSet>
511                                         struct hash_functor {
512                                                 hash_functor(std::pair<Cf,Ckey> &cterm, std::vector<cf_type1> &tc1, std::vector<cf_type2> &tc2,
513                                                         std::vector<Ckey> &ck1, std::vector<Ckey> &ck2,
514                                                         std::vector<const term_type1 *> &t1, std::vector<const term_type2 *> &t2,
515                                                         const GenericTruncator &trunc, HashSet *cms, const ArgsTuple &args_tuple):
516                                                         m_cterm(cterm),m_tc1(tc1),m_tc2(tc2),m_ck1(ck1),m_ck2(ck2),m_t1(t1),m_t2(t2),
517                                                         m_trunc(trunc),m_cms(cms),m_args_tuple(args_tuple) {}
518                                                 bool operator()(const std::size_t &i, const std::size_t &j)
519                                                 {
520                                                         typedef typename HashSet::iterator c_iterator;
521                                                         if (m_trunc.skip(&m_t1[i], &m_t2[j])) {
522                                                                 return false;
523                                                         }
524                                                         m_cterm.second = m_ck1[i];
525                                                         m_cterm.second += m_ck2[j];
526                                                         std::pair<bool,c_iterator> res = m_cms->find(m_cterm.second);
527                                                         if (res.first) {
528                                                                 res.second->first.addmul(m_tc1[i],m_tc2[j],m_args_tuple);
529                                                         } else {
530                                                                 // Assign to the temporary term the old cf (new_key is already assigned).
531                                                                 m_cterm.first = m_tc1[i];
532                                                                 // Multiply the old term by the second term.
533                                                                 m_cterm.first.mult_by(m_tc2[j],m_args_tuple);
534                                                                 m_cms->insert_new(m_cterm,res.second);
535                                                         }
536                                                         return true;
537                                                 }
538                                                 std::pair<Cf,Ckey>              &m_cterm;
539                                                 std::vector<cf_type1>           &m_tc1;
540                                                 std::vector<cf_type2>           &m_tc2;
541                                                 std::vector<Ckey>               &m_ck1;
542                                                 std::vector<Ckey>               &m_ck2;
543                                                 std::vector<const term_type1 *> &m_t1;
544                                                 std::vector<const term_type2 *> &m_t2;
545                                                 const GenericTruncator          &m_trunc;
546                                                 HashSet                         *m_cms;
547                                                 const ArgsTuple                 &m_args_tuple;
548                                         };
549                                         template <class GenericTruncator>
550                                         void perform_hash_coded_multiplication(std::vector<cf_type1> &tc1, std::vector<cf_type2> &tc2,
551                                                 std::vector<const term_type1 *> &t1, std::vector<const term_type2 *> &t2, const GenericTruncator &trunc)
552                                         {
553                                                 typedef coded_hash_table<cf_type1, max_fast_int, std_counting_allocator<char> > csht;
554                                                 typedef typename csht::iterator c_iterator;
555                                                 stats::trace_stat("mult_st",std::size_t(0),boost::lambda::_1 + 1);
556                                                 // Let's find a sensible size hint.
557                                                 const std::size_t n_codes = boost::numeric_cast<std::size_t>(boost::numeric::width(this->m_fast_h) + 1);
558                                                 const std::size_t size_hint = static_cast<std::size_t>(
559                                                         std::max<double>(this->m_density1,this->m_density2) * n_codes);
560                                                 const std::size_t size1 = this->m_terms1.size(), size2 = this->m_terms2.size();
561                                                 piranha_assert(size1 && size2);
562                                                 const args_tuple_type &args_tuple = this->m_args_tuple;
563                                                 csht cms(size_hint);
564                                                 // Find out a suitable block size.
565                                                 const std::size_t block_size = this->template compute_block_size<sizeof(std::pair<cf_type1,max_fast_int>)>();
566                                                 __PDEBUG(std::cout << "Block size: " << block_size << '\n');
567 // std::cout << "Block size: " << block_size << '\n';
568 // const boost::posix_time::ptime time0 = boost::posix_time::microsec_clock::local_time();
569                                                 std::pair<cf_type1,max_fast_int> cterm;
570                                                 hash_functor<cf_type1,max_fast_int,GenericTruncator,csht>
571                                                         hm(cterm,tc1,tc2,this->m_ckeys1,this->m_ckeys2a,t1,t2,trunc,&cms,args_tuple);
572                                                 this->blocked_multiplication(block_size,size1,size2,hm);
573 // std::cout << "Elapsed time: " << (double)(boost::posix_time::microsec_clock::local_time() - time0).total_microseconds() / 1000 << '\n';
574                                                 __PDEBUG(std::cout << "Done polynomial hash coded multiplying\n");
575                                                 // Decode and insert into retval.
576                                                 // TODO: add debug info about cms' size here.
577                                                 const c_iterator c_it_f = cms.end();
578                                                 term_type1 tmp_term;
579                                                 for (c_iterator c_it = cms.begin(); c_it != c_it_f; ++c_it) {
580                                                         this->decode(c_it->first,c_it->second + 2 * this->m_fast_h.lower(),tmp_term);
581                                                         if (!tmp_term.is_canonical(args_tuple)) {
582                                                                 tmp_term.canonicalise(args_tuple);
583                                                         }
584                                                         this->m_retval.insert(tmp_term, args_tuple);
585                                                 }
586                                                 __PDEBUG(std::cout << "Done polynomial hash coded\n");
587                                         }
588                         };
589         };
590 }
591
592 #endif