libMVL
Mappable vector library
libMVL_sort.cc
1 #include <algorithm>
2 #include <vector>
3 #include <functional>
4 #include "pdqsort.h"
5 #include "pdqidxsort.h"
6 
7 #define MVL_STATIC_MEMBERS 1
8 #include "libMVL.h"
9 
10 
11 template <class Numeric>
12 static void sort_indices_asc(LIBMVL_OFFSET64 count, LIBMVL_OFFSET64 *indices, Numeric *data)
13 {
14 
15 pdqidxsort_branchless(data, data+count, indices, [](Numeric a, Numeric b) { return (a<b);});
16 
17 // for(int i=0;i<stop-start;i++) {
18 // if(values[i]!=data[indices[i+start]])
19 // fprintf(stderr, "%d %.12g %.12g %lld\n", i, values[i], data[indices[i+start]], indices[i+start]);
20 // }
21 
22 // for(int i=0;i<stop-start-1;i++) {
23 // if(values[i]>values[i+1])fprintf(stderr, "%d %g %g\n", i, values[i], values[i+1]);
24 // if(data[indices[i+start]]>data[indices[i+1+start]])fprintf(stderr, "%d %g %g %lld %lld\n", i, data[indices[i+start]], data[indices[i+1+start]], indices[i+1], indices[i+1+start]);
25 // }
26 }
27 
28 template <class Numeric>
29 static void sort_indices_desc(LIBMVL_OFFSET64 count, LIBMVL_OFFSET64 *indices, Numeric *data)
30 {
31 
32 pdqidxsort_branchless(data, data+count, indices, [](Numeric a, Numeric b) { return (a>b);});
33 
34 // for(int i=0;i<stop-start;i++) {
35 // if(values[i]!=data[indices[i+start]])
36 // fprintf(stderr, "%d %.12g %.12g %lld\n", i, values[i], data[indices[i+start]], indices[i+start]);
37 // }
38 
39 // for(int i=0;i<stop-start-1;i++) {
40 // if(values[i]>values[i+1])fprintf(stderr, "%d %g %g\n", i, values[i], values[i+1]);
41 // if(data[indices[i+start]]>data[indices[i+1+start]])fprintf(stderr, "%d %g %g %lld %lld\n", i, data[indices[i+start]], data[indices[i+1+start]], indices[i+1], indices[i+1+start]);
42 // }
43 }
44 
45 static void sort_indices_packed_list64_asc(LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop, LIBMVL_OFFSET64 *indices, LIBMVL_VECTOR *vec, void *data)
46 {
47 std::sort(indices+start, indices+stop, [vec, data](LIBMVL_OFFSET64 i1, LIBMVL_OFFSET64 i2) {
48  LIBMVL_OFFSET64 al, bl, nn;
49  const unsigned char *ad, *bd;
52  ad=mvl_packed_list_get_entry(vec, data, i1);
53  bd=mvl_packed_list_get_entry(vec, data, i2);
54  nn=al;
55  if(bl<nn)nn=bl;
56  for(LIBMVL_OFFSET64 j=0;j<nn;j++) {
57  if(ad[j]<bd[j])return true;
58  if(ad[j]>bd[j])return false;
59  }
60  return(al<bl);
61  });
62 }
63 
64 static void sort_indices_packed_list64_desc(LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop, LIBMVL_OFFSET64 *indices, LIBMVL_VECTOR *vec, void *data)
65 {
66 std::sort(indices+start, indices+stop, [vec, data](LIBMVL_OFFSET64 i1, LIBMVL_OFFSET64 i2) {
67  LIBMVL_OFFSET64 al, bl, nn;
68  const unsigned char *ad, *bd;
71  ad=mvl_packed_list_get_entry(vec, data, i1);
72  bd=mvl_packed_list_get_entry(vec, data, i2);
73  nn=al;
74  if(bl<nn)nn=bl;
75  for(LIBMVL_OFFSET64 j=0;j<nn;j++) {
76  if(ad[j]>bd[j])return true;
77  if(ad[j]<bd[j])return false;
78  }
79  return(al>bl);
80  });
81 }
82 
83 template <class Numeric>
84 void mvl_find_ties(LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop, Numeric *data, std::vector<std::pair<LIBMVL_OFFSET64, LIBMVL_OFFSET64>> &ties)
85 {
86 LIBMVL_OFFSET64 i,j;
87 i=0;
88 while(i<stop-start-1) {
89  if(data[i]!=data[i+1]) {
90  i++;
91  continue;
92  }
93  for(j=i+2;(j<stop-start) && data[j]==data[i];j++);
94  ties.push_back(std::make_pair(i+start, j+start));
95  i=j;
96  }
97 }
98 
99 
100 class mvl_scratch {
101  LIBMVL_OFFSET64 length;
102  void *data0;
103  int err;
104 
105 public:
106 
107  mvl_scratch()
108  {
109  err=0;
110  data0=NULL;
111  length=0;
112  }
113 
114  int reserve(LIBMVL_OFFSET64 new_length)
115  {
116  if(new_length>length) {
117  free(data0);
118  length=new_length;
119  data0=malloc(new_length);
120  if(data0==NULL) {
121  length=0;
122  err= -1;
123  } else {
124  length=new_length;
125  }
126  }
127  return(err);
128  }
129 
130  int error(void) {
131  return(err);
132  }
133 
134  void * data(void)
135  {
136  return(data0);
137  }
138 
139  ~mvl_scratch()
140  {
141  free(data0);
142  }
143  };
144 
145 extern "C" {
146 
147 void mvl_indexed_sort_single_vector_asc(LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop, LIBMVL_OFFSET64 *indices, LIBMVL_VECTOR *vec, void *data, mvl_scratch &scratch)
148 {
149 
150 switch(mvl_vector_type(vec)) {
151  case LIBMVL_VECTOR_UINT8:
152  case LIBMVL_VECTOR_CSTRING: {
153  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
154  unsigned char *b=mvl_vector_data_uint8(vec);
155  unsigned char *d=(unsigned char*)scratch.data();
156  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
157  sort_indices_asc(stop-start, indices+start, d);
158  break;
159  }
160  case LIBMVL_VECTOR_INT32: {
161  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
162  int *b=mvl_vector_data_int32(vec);
163  int *d=(int *)scratch.data();
164  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
165  sort_indices_asc(stop-start, indices+start, d);
166  break;
167  }
168  case LIBMVL_VECTOR_FLOAT: {
169  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
170  float *b=mvl_vector_data_float(vec);
171  float *d=(float *)scratch.data();
172  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
173  sort_indices_asc(stop-start, indices+start, d);
174  break;
175  }
176  case LIBMVL_VECTOR_INT64: {
177  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
178  long long int *b=mvl_vector_data_int64(vec);
179  long long int *d=(long long int*)scratch.data();
180  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
181  sort_indices_asc(stop-start, indices+start, d);
182  break;
183  }
184  case LIBMVL_VECTOR_OFFSET64: {
185  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
187  LIBMVL_OFFSET64 *d=(LIBMVL_OFFSET64*)scratch.data();
188  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
189  sort_indices_asc(stop-start, indices+start, d);
190  break;
191  }
192  case LIBMVL_VECTOR_DOUBLE: {
193  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
194  double *b=mvl_vector_data_double(vec);
195  double *d=(double *)scratch.data();
196  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
197  sort_indices_asc(stop-start, indices+start, d);
198  break;
199  }
201  sort_indices_packed_list64_asc(start, stop, indices, vec, data);
202  break;
203  default:
204  break;
205  }
206 }
207 
208 void mvl_indexed_sort_single_vector_desc(LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop, LIBMVL_OFFSET64 *indices, LIBMVL_VECTOR *vec, void *data, mvl_scratch &scratch)
209 {
210 
211 switch(mvl_vector_type(vec)) {
212  case LIBMVL_VECTOR_UINT8:
213  case LIBMVL_VECTOR_CSTRING: {
214  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
215  unsigned char *b=mvl_vector_data_uint8(vec);
216  unsigned char *d=(unsigned char*)scratch.data();
217  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
218  sort_indices_desc(stop-start, indices+start, d);
219  break;
220  }
221  case LIBMVL_VECTOR_INT32: {
222  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
223  int *b=mvl_vector_data_int32(vec);
224  int *d=(int *)scratch.data();
225  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
226  sort_indices_desc(stop-start, indices+start, d);
227  break;
228  }
229  case LIBMVL_VECTOR_FLOAT: {
230  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
231  float *b=mvl_vector_data_float(vec);
232  float *d=(float *)scratch.data();
233  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
234  sort_indices_desc(stop-start, indices+start, d);
235  break;
236  }
237  case LIBMVL_VECTOR_INT64: {
238  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
239  long long int *b=mvl_vector_data_int64(vec);
240  long long int *d=(long long int*)scratch.data();
241  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
242  sort_indices_desc(stop-start, indices+start, d);
243  break;
244  }
245  case LIBMVL_VECTOR_OFFSET64: {
246  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
248  LIBMVL_OFFSET64 *d=(LIBMVL_OFFSET64*)scratch.data();
249  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
250  sort_indices_desc(stop-start, indices+start, d);
251  break;
252  }
253  case LIBMVL_VECTOR_DOUBLE: {
254  if(scratch.reserve((stop-start)*mvl_element_size(mvl_vector_type(vec)))<0)break;
255  double *b=mvl_vector_data_double(vec);
256  double *d=(double *)scratch.data();
257  for(LIBMVL_OFFSET64 i=0;i<stop-start;i++)d[i]=b[indices[i+start]];
258  sort_indices_desc(stop-start, indices+start, d);
259  break;
260  }
262  sort_indices_packed_list64_desc(start, stop, indices, vec, data);
263  break;
264  default:
265  break;
266  }
267 }
268 
269 
270 static inline int mvl_packed64_equal(LIBMVL_VECTOR *vec, void *data, LIBMVL_OFFSET64 i1, LIBMVL_OFFSET64 i2)
271 {
272 LIBMVL_OFFSET64 al, bl;
273 const unsigned char *ad, *bd;
276 if(al!=bl) {
277  return 0;
278  }
279 ad=mvl_packed_list_get_entry(vec, data, i1);
280 bd=mvl_packed_list_get_entry(vec, data, i2);
281 for(LIBMVL_OFFSET64 k=0;k<al;k++) {
282  if(ad[k]!=bd[k])return 0;
283  }
284 return 1;
285 }
286 
287 void mvl_indexed_find_ties(LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop, LIBMVL_OFFSET64 *indices, LIBMVL_VECTOR *vec, void *data, mvl_scratch &scratch, std::vector<std::pair<LIBMVL_OFFSET64, LIBMVL_OFFSET64>> &ties)
288 {
289 if(scratch.error()<0)return;
290 
291 switch(mvl_vector_type(vec)) {
292  case LIBMVL_VECTOR_UINT8:
293  case LIBMVL_VECTOR_CSTRING: {
294  unsigned char *d=(unsigned char*)scratch.data();
295  mvl_find_ties(start, stop, d, ties);
296  break;
297  }
298  case LIBMVL_VECTOR_INT32: {
299  int *d=(int *)scratch.data();
300  mvl_find_ties(start, stop, d, ties);
301  break;
302  }
303  case LIBMVL_VECTOR_FLOAT: {
304  float *d=(float *)scratch.data();
305  mvl_find_ties(start, stop, d, ties);
306  break;
307  }
308  case LIBMVL_VECTOR_INT64: {
309  long long int *d=(long long int*)scratch.data();
310  mvl_find_ties(start, stop, d, ties);
311  break;
312  }
313  case LIBMVL_VECTOR_OFFSET64: {
314  LIBMVL_OFFSET64 *d=(LIBMVL_OFFSET64*)scratch.data();
315  mvl_find_ties(start, stop, d, ties);
316  break;
317  }
318  case LIBMVL_VECTOR_DOUBLE: {
319  double *d=(double *)scratch.data();
320  mvl_find_ties(start, stop, d, ties);
321  break;
322  }
324  LIBMVL_OFFSET64 i,j;
325  i=start;
326  while(i<stop-1) {
327  if(!mvl_packed64_equal(vec, data, indices[i], indices[i+1])) {
328  i++;
329  continue;
330  }
331 
332  for(j=i+2;j<stop && mvl_packed64_equal(vec, data, indices[i], indices[j]);j++);
333  ties.push_back(std::make_pair(i, j));
334  i=j;
335  }
336  break;
337  default:
338  break;
339  }
340 }
341 
342 
343 /*
344  * This function sorts indices into a list of vectors so that the resulting permutation is ordered.
345  * The vector should all be the same length N, except LIBMVL_PACKED_LIST64 which should N+1 - this provides the same number of elements.
346  * The indices are from 0 to N-1 and can repeat.
347  *
348  * vec_data is the pointer to mapped data range where offsets point. This is needed only for vectors of type LIBMVL_PACKED_LIST64.
349  * You can set vec_data to NULL if LIBMVL_PACKED_LIST64 vectors are not present. Also entries vec_data[i] can be NULL if the corresponding vector is not of type
350  * LIBMVL_PACKED_LIST64
351  *
352  * This function return 0 on successful sort. If no vectors are supplies (vec_count==0) the indices are unchanged the sort is considered successful
353  */
354 int mvl_sort_indices(LIBMVL_OFFSET64 indices_count, LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, int sort_function)
355 {
356 if(vec_count<1)return 0;
357 LIBMVL_OFFSET64 i, j;
358 
359 mvl_scratch scratch;
360 std::vector<std::pair<LIBMVL_OFFSET64, LIBMVL_OFFSET64>> ties1, ties2;
361 
362 
363 ties1.clear();
364 ties1.push_back(std::make_pair(0, indices_count));
365 
366 for(i=0;i<vec_count;i++) {
367  ties2.clear();
368  for(j=0;j<ties1.size();j++) {
369  switch(sort_function) {
371  mvl_indexed_sort_single_vector_asc(ties1[j].first, ties1[j].second, indices, vec[i], vec_data[i], scratch);
372  break;
374  mvl_indexed_sort_single_vector_desc(ties1[j].first, ties1[j].second, indices, vec[i], vec_data[i], scratch);
375  break;
376  default:
377  return -1;
378  }
379 
380  mvl_indexed_find_ties(ties1[j].first, ties1[j].second, indices, vec[i], vec_data[i], scratch, ties2);
381  }
382  std::swap(ties1, ties2);
383  if(ties1.size()<1)break;
384  }
385 if(scratch.error()<0)return(scratch.error());
386 
387 if(ties1.size()>0) {
388  /* Sort indices in ascending order for any remaining ties.
389  * This is important to improve locality of memory accesses */
390 
391  for(j=0;j<ties1.size();j++) {
392  pdqsort(indices+ties1[j].first, indices+ties1[j].second);
393  }
394  }
395 
396 return 0;
397 }
398 
399 }
LIBMVL_SORT_LEXICOGRAPHIC_DESC
#define LIBMVL_SORT_LEXICOGRAPHIC_DESC
Definition: libMVL.h:855
mvl_vector_data_double
#define mvl_vector_data_double(data)
Definition: libMVL.h:527
mvl_packed_list_get_entry_bytelength
static LIBMVL_OFFSET64 mvl_packed_list_get_entry_bytelength(const LIBMVL_VECTOR *vec, LIBMVL_OFFSET64 idx)
Get length in bytes of string element idx from a packed list.
Definition: libMVL.h:794
mvl_element_size
static int mvl_element_size(int type)
Return the element size in bytes for a particular MVL type.
Definition: libMVL.h:75
mvl_packed_list_get_entry
static const unsigned char * mvl_packed_list_get_entry(const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 idx)
Get pointer to the start of string element idx from a packed list.
Definition: libMVL.h:812
LIBMVL_VECTOR_DOUBLE
#define LIBMVL_VECTOR_DOUBLE
Definition: libMVL.h:58
mvl_vector_data_offset
#define mvl_vector_data_offset(data)
Definition: libMVL.h:528
LIBMVL_PACKED_LIST64
#define LIBMVL_PACKED_LIST64
Definition: libMVL.h:62
LIBMVL_VECTOR_FLOAT
#define LIBMVL_VECTOR_FLOAT
Definition: libMVL.h:57
mvl_vector_type
#define mvl_vector_type(data)
Return type of data from a pointer to LIBMVL_VECTOR.
Definition: libMVL.h:467
mvl_vector_data_int64
#define mvl_vector_data_int64(data)
Definition: libMVL.h:525
mvl_vector_data_uint8
#define mvl_vector_data_uint8(data)
Definition: libMVL.h:523
LIBMVL_VECTOR
LIBMVL_VECTOR is the basic unit of information storage.
Definition: libMVL.h:184
LIBMVL_SORT_LEXICOGRAPHIC
#define LIBMVL_SORT_LEXICOGRAPHIC
Definition: libMVL.h:854
LIBMVL_VECTOR_UINT8
#define LIBMVL_VECTOR_UINT8
Definition: libMVL.h:54
LIBMVL_VECTOR_INT64
#define LIBMVL_VECTOR_INT64
Definition: libMVL.h:56
LIBMVL_VECTOR_INT32
#define LIBMVL_VECTOR_INT32
Definition: libMVL.h:55
LIBMVL_VECTOR_CSTRING
#define LIBMVL_VECTOR_CSTRING
Definition: libMVL.h:60
mvl_vector_data_int32
#define mvl_vector_data_int32(data)
Definition: libMVL.h:524
libMVL.h
core libMVL functions and structures
mvl_sort_indices
int mvl_sort_indices(LIBMVL_OFFSET64 indices_count, LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, int sort_function)
Given a table-like set of vectors of equal length arrange indices so that the columns are sorted lexi...
Definition: libMVL_sort.cc:354
LIBMVL_OFFSET64
unsigned long long LIBMVL_OFFSET64
MVL unsigned 64-bit type used for describing offsets into loaded data.
Definition: libMVL.h:98
mvl_vector_data_float
#define mvl_vector_data_float(data)
Definition: libMVL.h:526
LIBMVL_VECTOR_OFFSET64
#define LIBMVL_VECTOR_OFFSET64
Definition: libMVL.h:59