libMVL
Mappable vector library
libMVL.h
Go to the documentation of this file.
1 /* (c) Vladimir Dergachev 2019 */
2 
3 #ifndef __LIBMVL_H__
4 #define __LIBMVL_H__
5 
6 #include <stdio.h>
7 #include <math.h>
8 
9 #ifdef __cplusplus
10 extern "C" {
11 #endif
12 
17 /* Mappable Vector Library -
18  * a structured file format which can be efficiently used
19  * after read-only memory mapping, and can be appended while mapped,
20  * with versionable edits
21  */
22 
23 #define LIBMVL_SIGNATURE "MVL0"
24 #define LIBMVL_ENDIANNESS_FLAG 1.0
25 
54 #define LIBMVL_VECTOR_UINT8 1
55 #define LIBMVL_VECTOR_INT32 2
56 #define LIBMVL_VECTOR_INT64 3
57 #define LIBMVL_VECTOR_FLOAT 4
58 #define LIBMVL_VECTOR_DOUBLE 5
59 #define LIBMVL_VECTOR_OFFSET64 100
60 #define LIBMVL_VECTOR_CSTRING 101
61 
62 #define LIBMVL_PACKED_LIST64 102
63 #define LIBMVL_VECTOR_CHECKSUM 103
64 
65 
66 #define LIBMVL_VECTOR_POSTAMBLE1 1000 /* Old format using DIRECTORY_ENTRY */
67 #define LIBMVL_VECTOR_POSTAMBLE2 1001 /* New format using named list */
68 
69 
70 
75 static inline int mvl_element_size(int type)
76 {
77 switch(type) {
80  return 1;
83  return 4;
88  case LIBMVL_VECTOR_CHECKSUM:
89  return 8;
90  default:
91  return(0);
92  }
93 }
94 
95 
98 typedef unsigned long long LIBMVL_OFFSET64;
99 
/* Fixed-size record made of the members below. With 1-byte char and 4-byte
 * float/int the declared members add up to 64 bytes (4 + 4 + 4 + 13*4).
 * NOTE(review): presumably the leading record of an MVL file, produced by
 * mvl_write_preamble() -- confirm in libMVL.c.
 */
102 typedef struct {
103  char signature[4]; /* exactly 4 bytes, no room for a terminating NUL; presumably holds LIBMVL_SIGNATURE -- confirm */
104  float endianness; /* presumably stores LIBMVL_ENDIANNESS_FLAG so readers can detect a byte-order mismatch -- confirm */
105  unsigned int alignment; /* NOTE(review): units and exact meaning are not visible in this header */
106 
107  int reserved[13]; /* reserved space; pads the record */
108  } LIBMVL_PREAMBLE;
109 
112 typedef struct {
113  LIBMVL_OFFSET64 directory;
114  int type;
115  int reserved[13];
117 
120 typedef struct {
121  LIBMVL_OFFSET64 length;
122  int type;
123  int reserved[11];
124  LIBMVL_OFFSET64 metadata;
126 
127 
132 #define LIBMVL_CHECKSUM_ALGORITHM_INTERNAL1_HASH64 1
133 
134 
139 #define LIBMVL_FULL_CHECKSUMS_DIRECTORY_KEY "MVL_FULL_CHECKSUMS"
140 
141 
144 typedef struct {
145  LIBMVL_OFFSET64 length;
146  int type;
147  int checksum_algorithm;
148  LIBMVL_OFFSET64 checksum_area_start;
149  LIBMVL_OFFSET64 checksum_area_stop;
150  LIBMVL_OFFSET64 checksum_block_size;
151  int reserved[4];
152  LIBMVL_OFFSET64 metadata;
154 
155 #ifndef MVL_STATIC_MEMBERS
156 
157 // #ifdef __SANITIZE_ADDRESS__
158 // #define MVL_STATIC_MEMBERS 0
159 // #warning "Address sanitizer active, using C11 definition of LIBMVL_VECTOR"
160 // #else
161 // #ifdef __clang__
162 // #if __has_feature(address_sanitizer)
163 // #define MVL_STATIC_MEMBERS 0
164 // #warning "Address sanitizer active, using C11 definition of LIBMVL_VECTOR"
165 // #else
166 // #define MVL_STATIC_MEMBERS 1
167 // #endif
168 // #else
169 // #define MVL_STATIC_MEMBERS 1
170 // #endif
171 // #endif
172 
173 #define MVL_STATIC_MEMBERS 1
174 #endif
175 
176 #if MVL_STATIC_MEMBERS
177 /* This short and concise definition is portable and works with older compilers.
178  * However, when the code is instrumented with an address sanitizer it chokes on it
179  * thinking that data arrays are smaller than they are.
180  */
181 
/* Vector = header followed by payload, used when MVL_STATIC_MEMBERS is non-zero.
 * Every union member is declared with a small fixed element count; as the comment
 * above this definition explains, the real data arrays are larger than declared,
 * which is what confuses address sanitizers.
 */
184 typedef struct {
185  LIBMVL_VECTOR_HEADER header; /* vector header, immediately followed by the payload */
186  union {
187  unsigned char b[8]; /* payload viewed as bytes */
188  int i[2]; /* payload viewed as int */
189  long long i64[1]; /* payload viewed as long long */
190  float f[2]; /* payload viewed as float */
191  double d[1]; /* payload viewed as double */
192  LIBMVL_OFFSET64 offset[1]; /* payload viewed as LIBMVL_OFFSET64 */
193  } u;
194  } LIBMVL_VECTOR;
195 
196 #else
197 /* This requires flexible array members and unnamed structs and unions which only appear in C11 standard
198  * The complexity arises because the standard does not allow flexible array members in a union which makes it cumbersome
199  * to describe variable size payloads.
200  */
201 
/* Vector = header followed by payload, used when MVL_STATIC_MEMBERS is zero.
 * Each anonymous struct repeats the header so that its flexible array member
 * begins right after it; the union overlays all of these views.
 * Only the first struct names its header member "header"; header1..header5
 * are distinct names for the same leading bytes.
 */
202 typedef union {
203  struct {
204  LIBMVL_VECTOR_HEADER header;
205  unsigned char b[]; /* payload viewed as bytes */
206  };
207  struct {
208  LIBMVL_VECTOR_HEADER header1;
209  int i[]; /* payload viewed as int */
210  };
211  struct {
212  LIBMVL_VECTOR_HEADER header2;
213  long long i64[]; /* payload viewed as long long */
214  };
215  struct {
216  LIBMVL_VECTOR_HEADER header3;
217  float f[]; /* payload viewed as float */
218  };
219  struct {
220  LIBMVL_VECTOR_HEADER header4;
221  double d[]; /* payload viewed as double */
222  };
223  struct {
224  LIBMVL_VECTOR_HEADER header5;
225  LIBMVL_OFFSET64 offset[]; /* payload viewed as LIBMVL_OFFSET64 */
226  };
227  } LIBMVL_VECTOR;
228 #endif
229 
230 
236 typedef struct {
237  long size;
238  long free;
239  LIBMVL_OFFSET64 *offset;
240  unsigned char **tag;
241  long *tag_length;
242 
243  /* Optional hash table */
244 
245  long *next_item;
246  long *first_item;
247  LIBMVL_OFFSET64 hash_size;
249 
250 
/* State shared by the mvl_* functions that take a LIBMVL_CONTEXT pointer.
 * Only the uses visible in this header are stated as fact below; everything
 * else is marked for confirmation against libMVL.c.
 */
255 typedef struct {
256  int alignment; /* NOTE(review): presumably mirrors LIBMVL_PREAMBLE.alignment -- confirm */
257  int error; /* returned by mvl_get_error(), reset to 0 by mvl_clear_error() */
258 
259  LIBMVL_NAMED_LIST *directory; /* NOTE(review): presumably filled by mvl_add_directory_entry() -- confirm */
260  LIBMVL_OFFSET64 directory_offset;
261  LIBMVL_OFFSET64 full_checksums_offset;
262 
263  LIBMVL_NAMED_LIST *cached_strings; /* NOTE(review): presumably backs mvl_write_cached_string() -- confirm */
264 
265  LIBMVL_OFFSET64 character_class_offset;
266 
267  FILE *f; /* NOTE(review): presumably the stream handed to mvl_open() -- confirm */
268 
269  unsigned char *data; /* read through MVL_CONTEXT_DATA() */
270  LIBMVL_OFFSET64 data_size; /* read through MVL_CONTEXT_DATA_SIZE() */
271 
272 
273  LIBMVL_PREAMBLE tmp_preamble; /* scratch records embedded in the context */
274  LIBMVL_POSTAMBLE tmp_postamble;
275  LIBMVL_VECTOR_HEADER tmp_vh;
276 
277  int abort_on_error;
278  int flags; /* NOTE(review): presumably a combination of LIBMVL_CTX_FLAG_* bits -- confirm */
279 
280  } LIBMVL_CONTEXT;
281 
/* Pointer stored in the data member of the context that ctx points to.
 * The argument is parenthesized so that any pointer-valued expression
 * (for example &context or contexts+1) expands correctly.
 */
#define MVL_CONTEXT_DATA(ctx) ((ctx)->data)

/* Value stored in the data_size member of the context that ctx points to.
 * The argument is parenthesized for the same reason as in MVL_CONTEXT_DATA().
 */
#define MVL_CONTEXT_DATA_SIZE(ctx) ((ctx)->data_size)
291 
292 
293 #define LIBMVL_CTX_FLAG_HAVE_POSIX_FALLOCATE (1<<0)
294 #define LIBMVL_CTX_FLAG_HAVE_FTELLO (1<<1)
295 
296 #define LIBMVL_ERR_FAIL_PREAMBLE -1
297 #define LIBMVL_ERR_FAIL_POSTAMBLE -2
298 #define LIBMVL_ERR_UNKNOWN_TYPE -3
299 #define LIBMVL_ERR_FAIL_VECTOR -4
300 #define LIBMVL_ERR_INCOMPLETE_WRITE -5
301 #define LIBMVL_ERR_INVALID_SIGNATURE -6
302 #define LIBMVL_ERR_WRONG_ENDIANNESS -7
303 #define LIBMVL_ERR_EMPTY_DIRECTORY -8
304 #define LIBMVL_ERR_INVALID_DIRECTORY -9
305 #define LIBMVL_ERR_FTELL -10
306 #define LIBMVL_ERR_CORRUPT_POSTAMBLE -11
307 #define LIBMVL_ERR_INVALID_ATTR_LIST -12
308 #define LIBMVL_ERR_INVALID_OFFSET -13
309 #define LIBMVL_ERR_INVALID_ATTR -14
310 #define LIBMVL_ERR_CANNOT_SEEK -15
311 #define LIBMVL_ERR_INVALID_PARAMETER -16
312 #define LIBMVL_ERR_INVALID_LENGTH -17
313 #define LIBMVL_ERR_INVALID_EXTENT_INDEX -18
314 #define LIBMVL_ERR_CORRUPT_PACKED_LIST -19
315 #define LIBMVL_ERR_UNALIGNED_POINTER -20
316 #define LIBMVL_ERR_UNALIGNED_OFFSET -21
317 #define LIBMVL_ERR_INVALID_HEADER -22
318 #define LIBMVL_ERR_UNKNOWN_CHECKSUM_ALGORITHM -23
319 #define LIBMVL_ERR_CHECKSUM_FAILED -24
320 #define LIBMVL_ERR_NO_CHECKSUMS -25
321 #define LIBMVL_ERR_NO_DATA -26
322 #define LIBMVL_ERR_MVL_FILE_TOO_SHORT -27
323 
326 
327 
332 static inline int mvl_get_error(LIBMVL_CONTEXT *ctx)
333 {
334 return ctx->error;
335 }
336 
341 static inline void mvl_clear_error(LIBMVL_CONTEXT *ctx)
342 {
343 ctx->error=0;
344 }
345 
346 const char * mvl_strerror(LIBMVL_CONTEXT *ctx);
347 
350 #define LIBMVL_NO_METADATA 0
351 
354 #define LIBMVL_NULL_OFFSET 0
355 
356 
357 LIBMVL_OFFSET64 mvl_write_vector(LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 length, const void *data, LIBMVL_OFFSET64 metadata);
358 
359 /* This is identical to mvl_write_vector() except that it allows reserving space for more data than is supplied. */
360 LIBMVL_OFFSET64 mvl_start_write_vector(LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 expected_length, LIBMVL_OFFSET64 length, const void *data, LIBMVL_OFFSET64 metadata);
361 /* Rewrite data in already written vector with offset base_offset */
362 /* In particular this allows vectors to be built up in pieces, by calling mvl_start_write_vector first */
363 void mvl_rewrite_vector(LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 base_offset, LIBMVL_OFFSET64 idx, long length, const void *data);
364 
365 
366 LIBMVL_OFFSET64 mvl_write_concat_vectors(LIBMVL_CONTEXT *ctx, int type, long nvec, const long *lengths, void **data, LIBMVL_OFFSET64 metadata);
367 
368 /* This computes vector vec[index]
369  * Indices do not have to be distinct
370  * max_buffer is the maximum length of internal buffers in bytes (two buffers are needed for LIBMVL_PACKED_LIST64 vectors)
371  */
372 LIBMVL_OFFSET64 mvl_indexed_copy_vector(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 index_count, const LIBMVL_OFFSET64 *indices, const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 data_length, LIBMVL_OFFSET64 metadata, LIBMVL_OFFSET64 max_buffer);
373 
374 
375 /* Writes a single C string. In particular, this is handy for providing metadata tags */
376 /* length can be specified as -1 to be computed automatically */
377 LIBMVL_OFFSET64 mvl_write_string(LIBMVL_CONTEXT *ctx, long length, const char *data, LIBMVL_OFFSET64 metadata);
378 
379 /* A cached version of the above that assures the string is only written once. No metadata because the strings are reused */
380 LIBMVL_OFFSET64 mvl_write_cached_string(LIBMVL_CONTEXT *ctx, long length, const char *data);
381 
382 /* Create a packed list of strings
383  * str_size can be either NULL or provide string length, some of which can be -1
384  */
385 LIBMVL_OFFSET64 mvl_write_packed_list(LIBMVL_CONTEXT *ctx, long count, const long *str_size, unsigned char **str, LIBMVL_OFFSET64 metadata);
386 
387 /* Compute and write checksum vector */
388 LIBMVL_OFFSET64 mvl_write_hash64_checksum_vector(LIBMVL_CONTEXT *ctx, void *base, LIBMVL_OFFSET64 checksum_area_start, LIBMVL_OFFSET64 checksum_area_stop, LIBMVL_OFFSET64 checksum_block_size);
389 
390 /* Verify checksum for a given mapped area, could be just a portion of LIBMVL_VECTOR */
391 int mvl_verify_checksum_vector(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop);
392 /* Verify all area covered by checksums */
393 int mvl_verify_full_checksum_vector(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size);
394 /* Verify a single LIBMVL_VECTOR */
395 int mvl_verify_checksum_vector2(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 vector_offset);
396 /* Verify checksum for a given mapped area between pointers */
397 int mvl_verify_checksum_vector3(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, void *start, void * stop);
398 
399 /* This is convenient for writing several values of the same type as vector without allocating a temporary array.
400  * This function creates the array internally using alloca().
401  */
402 LIBMVL_OFFSET64 mvl_write_vector_inline(LIBMVL_CONTEXT *ctx, int type, int count, LIBMVL_OFFSET64 metadata, ...);
403 
404 #define MVL_NUMARGS(...) (sizeof((int[]){__VA_ARGS__})/sizeof(int))
405 
406 
412 #define MVL_WVEC(ctx, type, ...) mvl_write_vector_inline(ctx, type, MVL_NUMARGS(__VA_ARGS__), 0, __VA_ARGS__)
413 
414 
415 void mvl_add_directory_entry(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset, const char *tag);
416 void mvl_add_directory_entry_n(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset, const char *tag, LIBMVL_OFFSET64 tag_size);
418 
421 
422 /* By default named lists are created by mvl_create_named_list() without a hash table, to make adding elements faster
423  * Calling this function creates the hash table.
424  * Note that functions reading lists from MVL files create hash table automatically.
425  */
427 
428 long mvl_add_list_entry(LIBMVL_NAMED_LIST *L, long tag_length, const char *tag, LIBMVL_OFFSET64 offset);
429 LIBMVL_OFFSET64 mvl_find_list_entry(LIBMVL_NAMED_LIST *L, long tag_length, const char *tag);
431 
432 /* This is meant to operate on memory mapped (or in-memory) files */
433 LIBMVL_NAMED_LIST *mvl_read_attributes_list(LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 metadata_offset);
434 
435 /* Convenience function that creates a named list populated with necessary entries
436  * It needs writable context to write attribute values */
438 
439 /* Convenience function that returns an offset to attributes describing R-style character vector
440  * The attributes are written out during the first call to this function
441  */
443 
444 
445 /* This function writes contents of named list and creates R-compatible metadata with entry names */
448 
449 /* This convenience function writes named list of vectors as R-compatible data frame.
450  * A well formatted data frame would have vectors of the same length specified as nrows
451  * Assuring validity is up to the caller
452  *
453  * rownames specifies an offset of optional row names of the data frame. Set as 0 to omit.
454  */
456 
457 /* This is meant to operate on memory mapped (or in-memory) files */
458 LIBMVL_NAMED_LIST *mvl_read_named_list(LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset);
459 
460 void mvl_open(LIBMVL_CONTEXT *ctx, FILE *f);
461 void mvl_close(LIBMVL_CONTEXT *ctx);
462 void mvl_write_preamble(LIBMVL_CONTEXT *ctx);
463 void mvl_write_postamble(LIBMVL_CONTEXT *ctx);
464 
467 #define mvl_vector_type(data) (((LIBMVL_VECTOR_HEADER *)(data))->type)
468 
471 #define mvl_vector_length(data) (((LIBMVL_VECTOR_HEADER *)(data))->length)
472 
476 static inline LIBMVL_OFFSET64 mvl_vector_nentries(void *vec)
477 {
480 N=mvl_vector_length(vec0);
481 return((mvl_vector_type(vec0)==LIBMVL_PACKED_LIST64) ? N-1 : N);
482 }
483 
484 #if MVL_STATIC_MEMBERS
485 
494 #define mvl_vector_data(data) ((((LIBMVL_VECTOR *)(data))->u))
495 #else
496 #define mvl_vector_data(data) (*(((LIBMVL_VECTOR *)(data))))
497 #endif
498 
499 
/* Typed pointers to the first payload element of an MVL vector.
 *
 * data is a pointer to the start of the vector (its LIBMVL_VECTOR_HEADER);
 * the result points sizeof(LIBMVL_VECTOR_HEADER) bytes past it, cast to the
 * requested element type. The const qualifier of data, if any, is dropped by
 * the final cast.
 *
 * The argument is parenthesized before the cast is applied, so passing an
 * expression such as vec+1 offsets by whole objects of vec's type rather
 * than by single bytes.
 */
#define mvl_vector_data_uint8(data) ((unsigned char *)(((const char *)(data))+sizeof(LIBMVL_VECTOR_HEADER)))
#define mvl_vector_data_int32(data) ((int *)(((const char *)(data))+sizeof(LIBMVL_VECTOR_HEADER)))
#define mvl_vector_data_int64(data) ((long long int *)(((const char *)(data))+sizeof(LIBMVL_VECTOR_HEADER)))
#define mvl_vector_data_float(data) ((float *)(((const char *)(data))+sizeof(LIBMVL_VECTOR_HEADER)))
#define mvl_vector_data_double(data) ((double *)(((const char *)(data))+sizeof(LIBMVL_VECTOR_HEADER)))
#define mvl_vector_data_offset(data) ((LIBMVL_OFFSET64 *)(((const char *)(data))+sizeof(LIBMVL_VECTOR_HEADER)))
529 
532 #define mvl_vector_metadata_offset(data) ((((LIBMVL_VECTOR_HEADER *)(data))->metadata))
533 
534 
541 static inline int mvl_validate_vector(LIBMVL_OFFSET64 offset, const void *data, LIBMVL_OFFSET64 data_size) {
542 LIBMVL_VECTOR *vec;
543 int element_size;
544 if(offset+sizeof(LIBMVL_VECTOR_HEADER)>data_size)return(LIBMVL_ERR_INVALID_OFFSET);
545 vec=(LIBMVL_VECTOR *)&(((unsigned char *)data)[offset]);
546 
547 if(!(element_size=mvl_element_size(mvl_vector_type(vec))))return LIBMVL_ERR_UNKNOWN_TYPE;
548 
549 if(offset+sizeof(LIBMVL_VECTOR_HEADER)+mvl_vector_length(vec)*element_size>data_size)return(LIBMVL_ERR_INVALID_LENGTH);
550 
552  /* We check the first and last pointer of the packed list, as checking all the entries is inefficient
553  * A valid packed list will have all entries in increasing order, which is easy to check at the point of use
554  */
555  LIBMVL_OFFSET64 offset2=mvl_vector_data_offset(vec)[0];
556  LIBMVL_VECTOR *vec2;
557  if(offset2 < sizeof(LIBMVL_VECTOR_HEADER) || offset2>data_size)return(LIBMVL_ERR_INVALID_OFFSET);
558 
559  vec2=(LIBMVL_VECTOR *)&(((unsigned char *)data)[offset2-sizeof(LIBMVL_VECTOR_HEADER)]);
560 
561  if(mvl_vector_type(vec2)!=LIBMVL_VECTOR_UINT8)return(LIBMVL_ERR_UNKNOWN_TYPE);
562  if(offset2+mvl_vector_length(vec2)>data_size)return(LIBMVL_ERR_INVALID_LENGTH);
563 
564  if(mvl_vector_data_offset(vec)[mvl_vector_length(vec)-1]>offset2+mvl_vector_length(vec2))return(LIBMVL_ERR_INVALID_OFFSET);
565 
566  return(0);
567  }
568 
569 return(0);
570 }
571 
576 static inline int mvl_validate_vector2(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset) {
578 }
579 
588 static inline LIBMVL_VECTOR * mvl_vector_from_offset(void *data, LIBMVL_OFFSET64 offset)
589 {
590 return(offset==0 ? NULL : (LIBMVL_VECTOR *)(&(((unsigned char*)data)[offset])));
591 }
592 
602 {
603 return(((offset==0) || (mvl_validate_vector(offset, data, data_size)!=0)) ? NULL : (LIBMVL_VECTOR *)(&(((unsigned char*)data)[offset])));
604 }
605 
606 
607 /* These two convenience functions are meant for retrieving a few values, such as stored configuration parameters.
608  * Only floating point and offset values are supported as output because they have intrinsic notion of invalid value.
609  */
610 
620 static inline double mvl_as_double(const LIBMVL_VECTOR *vec, long idx)
621 {
622 if((idx<0) || (idx>=mvl_vector_length(vec)))return(NAN);
623 
624 switch(mvl_vector_type(vec)) {
626  return(mvl_vector_data_double(vec)[idx]);
627  case LIBMVL_VECTOR_FLOAT:
628  return(mvl_vector_data_float(vec)[idx]);
629  case LIBMVL_VECTOR_INT64:
630  return(mvl_vector_data_int64(vec)[idx]);
631  case LIBMVL_VECTOR_INT32:
632  return(mvl_vector_data_int32(vec)[idx]);
633  default:
634  return(NAN);
635  }
636 }
637 
647 static inline double mvl_as_double_default(const LIBMVL_VECTOR *vec, long idx, double def)
648 {
649 if((idx<0) || (idx>=mvl_vector_length(vec)))return(def);
650 
651 switch(mvl_vector_type(vec)) {
653  return(mvl_vector_data_double(vec)[idx]);
654  case LIBMVL_VECTOR_FLOAT:
655  return(mvl_vector_data_float(vec)[idx]);
656  case LIBMVL_VECTOR_INT64:
657  return(mvl_vector_data_int64(vec)[idx]);
658  case LIBMVL_VECTOR_INT32:
659  return(mvl_vector_data_int32(vec)[idx]);
660  default:
661  return(def);
662  }
663 }
664 
674 static inline LIBMVL_OFFSET64 mvl_as_offset(const LIBMVL_VECTOR *vec, long idx)
675 {
676 if((idx<0) || (idx>=mvl_vector_length(vec)))return(0);
677 
678 switch(mvl_vector_type(vec)) {
680  return(mvl_vector_data_offset(vec)[idx]);
681  default:
682  return(0);
683  }
684 }
685 
698 static inline double mvl_named_list_get_double(LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx)
699 {
700 LIBMVL_VECTOR *vec;
701 LIBMVL_OFFSET64 ofs;
702 ofs=mvl_find_list_entry(L, tag_length, tag);
703 if(ofs==0)return(NAN);
704 
705 vec=(LIBMVL_VECTOR *)&(((char *)data)[ofs]);
706 return(mvl_as_double(vec, idx));
707 }
708 
722 static inline double mvl_named_list_get_double_default(LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx, double def)
723 {
724 LIBMVL_VECTOR *vec;
725 LIBMVL_OFFSET64 ofs;
726 ofs=mvl_find_list_entry(L, tag_length, tag);
727 if(ofs==0)return(def);
728 
729 vec=(LIBMVL_VECTOR *)&(((char *)data)[ofs]);
730 return(mvl_as_double_default(vec, idx, def));
731 }
732 
745 static inline LIBMVL_OFFSET64 mvl_named_list_get_offset(LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx)
746 {
747 LIBMVL_VECTOR *vec;
748 LIBMVL_OFFSET64 ofs;
749 ofs=mvl_find_list_entry(L, tag_length, tag);
750 if(ofs==0)return(0);
751 
752 vec=(LIBMVL_VECTOR *)&(((char *)data)[ofs]);
753 return(mvl_as_offset(vec, idx));
754 }
755 
761 #define MVL_NA_STRING "\000\000NA"
762 #define MVL_NA_STRING_LENGTH 4
763 
764 static inline int mvl_string_is_na(const char *s, LIBMVL_OFFSET64 len)
765 {
766 if(len!=4)return 0;
767 if((s[0]==0 && s[1]==0 && s[2]=='N' && s[3]=='A'))return 1;
768 return(0);
769 }
770 
777 static inline int mvl_packed_list_is_na(const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 idx)
778 {
779 LIBMVL_OFFSET64 start, stop, len;
780 if(mvl_vector_type(vec)!=LIBMVL_PACKED_LIST64)return 1;
781 len=mvl_vector_length(vec);
782 if((idx+1>=len) || (idx<0))return 1;
783 start=mvl_vector_data_offset(vec)[idx];
784 stop=mvl_vector_data_offset(vec)[idx+1];
785 return(mvl_string_is_na(&(((const char *)(data))[start]), stop-start));
786 }
787 
788 
795 {
796 LIBMVL_OFFSET64 start, stop, len;
797 if(mvl_vector_type(vec)!=LIBMVL_PACKED_LIST64)return -1;
798 len=mvl_vector_length(vec);
799 if((idx+1>=len) || (idx<0))return -1;
800 start=mvl_vector_data_offset(vec)[idx];
801 stop=mvl_vector_data_offset(vec)[idx+1];
802 return(stop-start);
803 }
804 
805 /* This returns char even though the underlying type can be different - we just want the pointer */
812 static inline const unsigned char * mvl_packed_list_get_entry(const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 idx)
813 {
814 LIBMVL_OFFSET64 start, len;
815 if(mvl_vector_type(vec)!=LIBMVL_PACKED_LIST64)return NULL;
816 len=mvl_vector_length(vec);
817 if((idx+1>=len) || (idx<0))return NULL;
818 start=mvl_vector_data_offset(vec)[idx];
819 return(&(((const unsigned char *)(data))[start]));
820 }
821 
822 /* Validate packed list entry */
829 static inline int mvl_packed_list_validate_entry(const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 idx)
830 {
831 LIBMVL_OFFSET64 start, stop, len;
832 if(mvl_vector_type(vec)!=LIBMVL_PACKED_LIST64)return -1;
833 len=mvl_vector_length(vec);
834 if((idx+1>=len) || (idx<0))return -2;
835 start=mvl_vector_data_offset(vec)[idx];
836 stop=mvl_vector_data_offset(vec)[idx+1];
837 if(start>data_size)return(-3);
838 if(stop>data_size)return(-4);
839 return(0);
840 }
841 
843 
844 /* This initializes context to use in-memory image of given length starting at data
845  * the image could have been loaded via fread, or memory mapped
846  */
847 void mvl_load_image(LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 length);
848 
854 #define LIBMVL_SORT_LEXICOGRAPHIC 1 /* Ascending */
855 #define LIBMVL_SORT_LEXICOGRAPHIC_DESC 2 /* Descending */
856 
857 /*
858  * This function sorts indices into a list of vectors so that the resulting permutation is ordered.
859  * The vectors should all have the same length N, except LIBMVL_PACKED_LIST64 vectors, which should have length N+1 - this provides the same number of elements.
860  * The indices are from 0 to N-1 and can repeat.
861  *
862  * vec_data is the pointer to mapped data range where offsets point. This is needed only for vectors of type LIBMVL_PACKED_LIST64.
863  * You can set vec_data to NULL if LIBMVL_PACKED_LIST64 vectors are not present. Also entries vec_data[i] can be NULL if the corresponding vector is not of type
864  * LIBMVL_PACKED_LIST64
865  *
866  * This function returns 0 on successful sort. If no vectors are supplied (vec_count==0), the indices are unchanged and the sort is considered successful.
867  */
877 int mvl_sort_indices(LIBMVL_OFFSET64 indices_count, LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, int sort_function);
878 
879 
880 /* Hash function */
881 
882 /* This randomizes bits of 64-bit numbers. */
889 {
890  x^=x>>31;
891 x*=18397683724573214587LLU;
892  x^=x>>32;
893 x*=13397683724573242421LLU;
894  x^=x>>33;
895  return(x);
896 }
897 
898 /* 32 bit primes: 2147483629 2147483647 */
899 
900 /* Untested for randomization quality */
/*
 * Scramble the bits of a 32-bit value.
 *
 * Three xor-shift steps (by 15, 14 and 13 bits) are interleaved with two
 * multiplications by odd constants; each product is truncated back to
 * unsigned. 0 maps to 0.
 *
 * x - value to scramble
 *
 * Returns the scrambled value.
 */
static inline unsigned mvl_randomize_bits32(unsigned x)
{
	unsigned h = x;

	h = h ^ (h >> 15);
	h = (unsigned)(h * 2354983627LLU);
	h = h ^ (h >> 14);
	h = (unsigned)(h * 2554984639LLU);
	h = h ^ (h >> 13);

	return h;
}
914 
918 #define MVL_SEED_HASH_VALUE 0xabcdef
919 
920 /* This allows to accumulate hash value from several sources.
921  * Initial x value can be anything except 0
922  */
931 static inline LIBMVL_OFFSET64 mvl_accumulate_hash64(LIBMVL_OFFSET64 x, const unsigned char *data, LIBMVL_OFFSET64 count)
932 {
934 for(i=0;i<count;i++) {
935  x=(x+data[i]);
936  x*=13397683724573242421LLU;
937  x^=x>>33;
938  }
939 return(x);
940 }
941 
942 /* This allows to accumulate hash value from several sources.
943  * Initial x values can be anything except 0.
944  * The accumulation is done in place and in parallel for 8 streams, count bytes for each stream.
945  */
946 static inline void mvl_accumulate_hash64x8(LIBMVL_OFFSET64 *x, const unsigned char *data0, const unsigned char *data1, const unsigned char *data2, const unsigned char *data3, const unsigned char *data4, const unsigned char *data5, const unsigned char *data6, const unsigned char *data7, LIBMVL_OFFSET64 count)
947 {
948 LIBMVL_OFFSET64 i, x0, x1, x2, x3, x4, x5, x6, x7;
949 
950 x0=x[0];
951 x1=x[1];
952 x2=x[2];
953 x3=x[3];
954 x4=x[4];
955 x5=x[5];
956 x6=x[6];
957 x7=x[7];
958 
959 for(i=0;i<count;i++) {
960  #define STEP(k) {\
961  x ## k=( (x ## k) +(data ## k)[i]); \
962  (x ## k)*=13397683724573242421LLU; \
963  (x ## k) ^= (x ## k)>>33; \
964  }
965  STEP(0)
966  STEP(1)
967  STEP(2)
968  STEP(3)
969  STEP(4)
970  STEP(5)
971  STEP(6)
972  STEP(7)
973  #undef STEP
974  }
975 
976 x[0]=x0;
977 x[1]=x1;
978 x[2]=x2;
979 x[3]=x3;
980 x[4]=x4;
981 x[5]=x5;
982 x[6]=x6;
983 x[7]=x7;
984 }
985 
986 
987 /* This allows to accumulate hash value from several sources.
988  * Initial x value can be anything except 0
989  *
990  * This function accumulates 32-bit signed ints by value
991  */
1002 {
1003 LIBMVL_OFFSET64 i;
1004 long long int d;
1005 unsigned *d_ext=(unsigned *)&d;
1006 for(i=0;i<count;i++) {
1007  d=data[i];
1008  x=(x + d_ext[0]);
1009  x*=13397683724573242421LLU;
1010  x^=x>>33;
1011  x=(x + d_ext[1]);
1012  x*=13397683724573242421LLU;
1013  x^=x>>33;
1014  }
1015 return(x);
1016 }
1017 
1018 /* This allows to accumulate hash value from several sources.
1019  * Initial x value can be anything except 0
1020  *
1021  * This function accumulates 64-bit signed ints by value
1022  */
1032 static inline LIBMVL_OFFSET64 mvl_accumulate_int64_hash64(LIBMVL_OFFSET64 x, const long long int *data, LIBMVL_OFFSET64 count)
1033 {
1034 LIBMVL_OFFSET64 i;
1035 long long int d;
1036 unsigned *d_ext=(unsigned *)&d;
1037 for(i=0;i<count;i++) {
1038  d=data[i];
1039  x=(x + d_ext[0]);
1040  x*=13397683724573242421LLU;
1041  x^=x>>33;
1042  x=(x + d_ext[1]);
1043  x*=13397683724573242421LLU;
1044  x^=x>>33;
1045  }
1046 return(x);
1047 }
1048 
1049 /* This allows to accumulate hash value from several sources.
1050  * Initial x value can be anything except 0
1051  *
1052  * This function accumulates 32-bit floats by value, so that a float promoted to double will have the same hash
1053  */
1065 {
1066 LIBMVL_OFFSET64 i;
1067 double d;
1068 unsigned *d_ext=(unsigned *)&d;
1069 for(i=0;i<count;i++) {
1070  d=data[i];
1071  x=(x + d_ext[0]);
1072  x*=13397683724573242421LLU;
1073  x^=x>>33;
1074  x=(x + d_ext[1]);
1075  x*=13397683724573242421LLU;
1076  x^=x>>33;
1077  }
1078 return(x);
1079 }
1080 
1081 /* This allows to accumulate hash value from several sources.
1082  * Initial x value can be anything except 0
1083  *
1084  * This function accumulates 64-bit doubles by value, so that a float promoted to double will have the same hash as original float
1085  */
1097 {
1098 LIBMVL_OFFSET64 i;
1099 double d;
1100 unsigned *d_ext=(unsigned *)&d;
1101 for(i=0;i<count;i++) {
1102  d=data[i];
1103  x=(x + d_ext[0]);
1104  x*=13397683724573242421LLU;
1105  x^=x>>33;
1106  x=(x + d_ext[1]);
1107  x*=13397683724573242421LLU;
1108  x^=x>>33;
1109  }
1110 return(x);
1111 }
1112 
1129 #define LIBMVL_ACCUMULATE_HASH 0
1130 #define LIBMVL_INIT_HASH 1
1131 #define LIBMVL_FINALIZE_HASH 2
1132 #define LIBMVL_COMPLETE_HASH (LIBMVL_INIT_HASH | LIBMVL_FINALIZE_HASH)
1133 
1134 /* This function is used to compute 64 bit hash of vector values
1135  * array hash[] is passed in and contains the result of the computation
1136  *
1137  * Integer indices are computed by value, so that 100 produces the same hash whether it is stored as INT32 or INT64.
1138  *
1139  * Floats and doubles are trickier - we can guarantee that the hash of float promoted to double is the same as the hash of the original float, but not the reverse.
1140  */
1141 int mvl_hash_indices(LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 *hash, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, int flags);
1142 int mvl_hash_range(LIBMVL_OFFSET64 i0, LIBMVL_OFFSET64 i1, LIBMVL_OFFSET64 *hash, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, int flags);
1143 
1144 /* This structure can either be allocated by libMVL or constructed by the caller
1145  * In the latter case read comments describing size constraints
1146  *
1147  * The purpose of having index_size is to facilitate memory reuse by allocating the structure with index_size large enough to accommodate subsequent calls with different index_count
1148  */
1159 #define MVL_FLAG_OWN_HASH (1<<0)
1160 #define MVL_FLAG_OWN_HASH_MAP (1<<1)
1161 #define MVL_FLAG_OWN_FIRST (1<<2)
1162 #define MVL_FLAG_OWN_NEXT (1<<3)
1163 #define MVL_FLAG_OWN_VEC_TYPES (1<<4)
1164 
1170 typedef struct {
1181  int *vec_types;
1182  } HASH_MAP;
1183 
1184 /* Compute suggested hash map size */
1186 
1188 void mvl_free_hash_map(HASH_MAP *hash_map);
1189 
1190 /* This uses data from hm->hash[] array */
1191 void mvl_compute_hash_map(HASH_MAP *hm);
1192 
1193 /* Find count of matches between hashes of two sets.
1194  */
1196 
1197 /* Find indices of keys in set of hashes, using hash map.
1198  * Only the first matching hash is reported.
1199  * If not found the index is set to ~0 (0xfff...fff)
1200  * Output is in key_indices
1201  */
1202 void mvl_find_first_hashes(LIBMVL_OFFSET64 key_count, const LIBMVL_OFFSET64 *key_hash, LIBMVL_OFFSET64 *key_indices, HASH_MAP *hm);
1203 
1204 /* This function computes pairs of merge indices. The pairs are stored in key_match_indices[] and match_indices[].
1205  * All arrays should be provided by the caller. The size of match_indices arrays is computed with mvl_hash_match_count()
1206  * An auxiliary array key_last of length key_indices_count stores the stop before index (in terms of matches array).
1207  * In particular the total number of matches is given by key_last[key_indices_count-1]
1208  */
1209 int mvl_find_matches(LIBMVL_OFFSET64 key_indices_count, const LIBMVL_OFFSET64 *key_indices, LIBMVL_OFFSET64 key_vec_count, LIBMVL_VECTOR **key_vec, void **key_vec_data, LIBMVL_OFFSET64 *key_vec_data_length, LIBMVL_OFFSET64 *key_hash,
1210  LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, HASH_MAP *hm,
1211  LIBMVL_OFFSET64 *key_last, LIBMVL_OFFSET64 pairs_size, LIBMVL_OFFSET64 *key_match_indices, LIBMVL_OFFSET64 *match_indices);
1212 
1213 /* This function transforms HASH_MAP into a list of groups.
1214  * After calling hm->hash_map is invalid, but hm->first and hm->next describe exactly identical rows
1215  */
1216 void mvl_find_groups(LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, HASH_MAP *hm);
1217 
1218 
1223 typedef struct {
1227  } LIBMVL_PARTITION;
1228 
1229 void mvl_init_partition(LIBMVL_PARTITION *el);
1231 void mvl_find_repeats(LIBMVL_PARTITION *partition, LIBMVL_OFFSET64 count, LIBMVL_VECTOR **vec, void **data, LIBMVL_OFFSET64 *data_length);
1233 
1234 #ifndef LIBMVL_EXTENT_INLINE_SIZE
1235 #define LIBMVL_EXTENT_INLINE_SIZE 4
1236 #endif
1237 
1242 typedef struct {
1247  LIBMVL_OFFSET64 start_inline[LIBMVL_EXTENT_INLINE_SIZE];
1248  LIBMVL_OFFSET64 stop_inline[LIBMVL_EXTENT_INLINE_SIZE];
1250 
1254 
1260 typedef struct {
1261  LIBMVL_PARTITION partition;
1262  HASH_MAP hash_map;
1264 
1265 
1268 int mvl_compute_extent_index(LIBMVL_EXTENT_INDEX *ei, LIBMVL_OFFSET64 count, LIBMVL_VECTOR **vec, void **data, LIBMVL_OFFSET64 *data_length);
1270 int mvl_load_extent_index(LIBMVL_CONTEXT *ctx, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset, LIBMVL_EXTENT_INDEX *ei);
1271 
1277 {
1278 el->count=0;
1279 }
1280 
1281 
1289 {
1290 LIBMVL_OFFSET64 idx, count;
1291 
1292 count=ei->hash_map.hash_count;
1293 idx=ei->hash_map.hash_map[hash & (ei->hash_map.hash_map_size-1)];
1294 
1295 while(idx<count) {
1296  if(hash==ei->hash_map.hash[idx]) {
1297  if(el->count>=el->size)mvl_extend_extent_list(el, 0);
1298  el->start[el->count]=ei->partition.offset[idx];
1299  el->stop[el->count]=ei->partition.offset[idx+1];
1300  el->count++;
1301  }
1302  idx=ei->hash_map.next[idx];
1303  }
1304 }
1305 
1306 
1311 typedef struct {
1312  double max;
1313  double min;
1314  double center;
1315  double scale;
1317  double nrepeat;
1318  } LIBMVL_VEC_STATS;
1319 
1320 void mvl_compute_vec_stats(const LIBMVL_VECTOR *vec, LIBMVL_VEC_STATS *stats);
1321 /* i0 and i1 denote the range of values to normalize. This allows to process vector one buffer at a time */
1322 void mvl_normalize_vector(const LIBMVL_VECTOR *vec, const LIBMVL_VEC_STATS *stats, LIBMVL_OFFSET64 i0, LIBMVL_OFFSET64 i1, double *out);
1323 
1327 #define MVL_EXTENT_INDEX 1
1328 #define MVL_SPATIAL_INDEX1 2
1329 
1330 
1331 #ifdef __cplusplus
1332 }
1333 #endif
1334 
1335 #endif
mvl_normalize_vector
void mvl_normalize_vector(const LIBMVL_VECTOR *vec, const LIBMVL_VEC_STATS *stats, LIBMVL_OFFSET64 i0, LIBMVL_OFFSET64 i1, double *out)
normalize vector
Definition: libMVL.c:3379
LIBMVL_VEC_STATS::nrepeat
double nrepeat
number of stretches with identical elements
Definition: libMVL.h:1317
LIBMVL_VECTOR_HEADER
This structure describes the header of MVL vector. It is basically LIBMVL_VECTOR without the actual d...
Definition: libMVL.h:120
mvl_init_extent_list
void mvl_init_extent_list(LIBMVL_EXTENT_LIST *el)
Initialize freshly allocated extent list structure.
Definition: libMVL.c:2964
mvl_empty_extent_list
static void mvl_empty_extent_list(LIBMVL_EXTENT_LIST *el)
Alter extent list to contain no extents without freeing memory.
Definition: libMVL.h:1276
mvl_write_named_list
LIBMVL_OFFSET64 mvl_write_named_list(LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L)
Write out named list. In R, this would be read back as list.
Definition: libMVL.c:1487
mvl_strerror
const char * mvl_strerror(LIBMVL_CONTEXT *ctx)
Obtain description of error code.
Definition: libMVL.c:217
mvl_clear_error
static void mvl_clear_error(LIBMVL_CONTEXT *ctx)
Clear error code.
Definition: libMVL.h:341
LIBMVL_PARTITION::offset
LIBMVL_OFFSET64 * offset
Array of offsets partitioning the vector.
Definition: libMVL.h:1226
mvl_write_named_list_as_data_frame
LIBMVL_OFFSET64 mvl_write_named_list_as_data_frame(LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L, int nrows, LIBMVL_OFFSET64 rownames)
Write out named list in the style of R data frames. It is assumed that all entries of L are vectors w...
Definition: libMVL.c:1532
LIBMVL_EXTENT_LIST::stop
LIBMVL_OFFSET64 * stop
First element just past the extent end.
Definition: libMVL.h:1246
mvl_indexed_copy_vector
LIBMVL_OFFSET64 mvl_indexed_copy_vector(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 index_count, const LIBMVL_OFFSET64 *indices, const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 data_length, LIBMVL_OFFSET64 metadata, LIBMVL_OFFSET64 max_buffer)
Write MVL vector that contains data at specific indices. The indices can repeat, and can themselves b...
Definition: libMVL.c:491
mvl_verify_full_checksum_vector
int mvl_verify_full_checksum_vector(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size)
Compute and verify checksums for the entire area covered by checksum vector.
Definition: libMVL.c:1052
mvl_accumulate_hash64
static LIBMVL_OFFSET64 mvl_accumulate_hash64(LIBMVL_OFFSET64 x, const unsigned char *data, LIBMVL_OFFSET64 count)
Accumulate hash from a piece of data.
Definition: libMVL.h:931
mvl_free_hash_map
void mvl_free_hash_map(HASH_MAP *hash_map)
Free allocated HASH_MAP.
Definition: libMVL.c:2529
mvl_vector_data_double
#define mvl_vector_data_double(data)
Definition: libMVL.h:527
LIBMVL_VEC_STATS::center
double center
a value in the "middle" of the vector
Definition: libMVL.h:1314
HASH_MAP::hash_map_size
LIBMVL_OFFSET64 hash_map_size
size of hash_map array, should be power of 2
Definition: libMVL.h:1174
mvl_randomize_bits64
static LIBMVL_OFFSET64 mvl_randomize_bits64(LIBMVL_OFFSET64 x)
Randomize bits of 64-bit numbers, typically after accumulating a hash value.
Definition: libMVL.h:888
mvl_named_list_get_double
static double mvl_named_list_get_double(LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx)
Find an entry in a named list and return its idx value as a double.
Definition: libMVL.h:698
mvl_close
void mvl_close(LIBMVL_CONTEXT *ctx)
Write out MVL file directory and postamble and close file.
Definition: libMVL.c:1735
LIBMVL_EXTENT_LIST::start
LIBMVL_OFFSET64 * start
First extent element.
Definition: libMVL.h:1245
mvl_as_double_default
static double mvl_as_double_default(const LIBMVL_VECTOR *vec, long idx, double def)
Return idx vector entry as a double, with default for missing values.
Definition: libMVL.h:647
LIBMVL_NAMED_LIST
This structure describes a named list - an array of LIBMVL_OFFSET64 entries each with a character nam...
Definition: libMVL.h:236
HASH_MAP::first_count
LIBMVL_OFFSET64 first_count
Number of valid entries in first array - this is populated by mvl_find_groups()
Definition: libMVL.h:1175
mvl_compute_hash_map_size
LIBMVL_OFFSET64 mvl_compute_hash_map_size(LIBMVL_OFFSET64 hash_count)
Compute suggested size of hash map given the number of entries to hash. Hash map size should always b...
Definition: libMVL.c:2486
mvl_find_matches
int mvl_find_matches(LIBMVL_OFFSET64 key_indices_count, const LIBMVL_OFFSET64 *key_indices, LIBMVL_OFFSET64 key_vec_count, LIBMVL_VECTOR **key_vec, void **key_vec_data, LIBMVL_OFFSET64 *key_vec_data_length, LIBMVL_OFFSET64 *key_hash, LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, HASH_MAP *hm, LIBMVL_OFFSET64 *key_last, LIBMVL_OFFSET64 pairs_size, LIBMVL_OFFSET64 *key_match_indices, LIBMVL_OFFSET64 *match_indices)
Compute pairs of merge indices. This is similar to JOIN operation in SQL.
Definition: libMVL.c:2719
mvl_packed_list_get_entry_bytelength
static LIBMVL_OFFSET64 mvl_packed_list_get_entry_bytelength(const LIBMVL_VECTOR *vec, LIBMVL_OFFSET64 idx)
Get length in bytes of string element idx from a packed list.
Definition: libMVL.h:794
LIBMVL_PARTITION::count
LIBMVL_OFFSET64 count
partition has count valid elements
Definition: libMVL.h:1225
mvl_named_list_get_offset
static LIBMVL_OFFSET64 mvl_named_list_get_offset(LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx)
Find an entry in a named list and return its idx value as an offset.
Definition: libMVL.h:745
mvl_element_size
static int mvl_element_size(int type)
Return the element size in bytes for a particular MVL type.
Definition: libMVL.h:75
mvl_load_image
void mvl_load_image(LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 length)
Initialize MVL context to operate with memory mapped area data.
Definition: libMVL.c:1787
mvl_packed_list_get_entry
static const unsigned char * mvl_packed_list_get_entry(const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 idx)
Get pointer to the start of string element idx from a packed list.
Definition: libMVL.h:812
mvl_find_groups
void mvl_find_groups(LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, HASH_MAP *hm)
This function transforms HASH_MAP into a list of groups. Similar to GROUP BY clause in SQL.
Definition: libMVL.c:2798
mvl_validated_vector_from_offset
static LIBMVL_VECTOR * mvl_validated_vector_from_offset(void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset)
A convenience function to convert an offset into memory mapped data into a pointer to LIBMVL_VECTOR s...
Definition: libMVL.h:601
mvl_compute_hash_map
void mvl_compute_hash_map(HASH_MAP *hm)
Compute hash map. This assumes that hm->hash array has been populated with hm->hash_count hashes comp...
Definition: libMVL.c:2546
HASH_MAP::next
LIBMVL_OFFSET64 * next
array of next indices in each group. ~0LLU indicates end of group
Definition: libMVL.h:1179
mvl_write_concat_vectors
LIBMVL_OFFSET64 mvl_write_concat_vectors(LIBMVL_CONTEXT *ctx, int type, long nvec, const long *lengths, void **data, LIBMVL_OFFSET64 metadata)
Write complete MVL vector concatenating data from many vectors or arrays.
Definition: libMVL.c:654
mvl_load_extent_index
int mvl_load_extent_index(LIBMVL_CONTEXT *ctx, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset, LIBMVL_EXTENT_INDEX *ei)
Load extent index from memory mapped MVL file.
Definition: libMVL.c:3135
LIBMVL_VEC_STATS
Vector statistics.
Definition: libMVL.h:1311
mvl_free_partition_arrays
void mvl_free_partition_arrays(LIBMVL_PARTITION *el)
free arrays of previously allocated partition. This function does not free the structure itself.
Definition: libMVL.c:2952
LIBMVL_VEC_STATS::scale
double scale
normalization scale
Definition: libMVL.h:1315
LIBMVL_VECTOR_DOUBLE
#define LIBMVL_VECTOR_DOUBLE
Definition: libMVL.h:58
mvl_accumulate_double_hash64
static LIBMVL_OFFSET64 mvl_accumulate_double_hash64(LIBMVL_OFFSET64 x, const double *data, LIBMVL_OFFSET64 count)
Accumulate hash from an array of 64-bit floats. The floats are hashed by value, not representation,...
Definition: libMVL.h:1096
mvl_allocate_hash_map
HASH_MAP * mvl_allocate_hash_map(LIBMVL_OFFSET64 max_index_count)
Create HASH_MAP structure.
Definition: libMVL.c:2505
mvl_find_list_entry
LIBMVL_OFFSET64 mvl_find_list_entry(LIBMVL_NAMED_LIST *L, long tag_length, const char *tag)
Find existing entry inside LIBMVL_NAMED_LIST. If several identically named entries exist this functio...
Definition: libMVL.c:1416
LIBMVL_POSTAMBLE
This structure is written last to close MVL file. It contains an offset to MVL directory that can be ...
Definition: libMVL.h:112
mvl_vector_length
#define mvl_vector_length(data)
Return number of elements from a pointer to LIBMVL_VECTOR.
Definition: libMVL.h:471
mvl_packed_list_validate_entry
static int mvl_packed_list_validate_entry(const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 idx)
Validate string element idx of a packed list against the bounds of the loaded data.
Definition: libMVL.h:829
mvl_find_directory_entry
LIBMVL_OFFSET64 mvl_find_directory_entry(LIBMVL_CONTEXT *ctx, const char *tag)
Find entry in MVL file directory.
Definition: libMVL.c:1772
HASH_MAP::vec_types
int * vec_types
Types of vectors used to produce hashes.
Definition: libMVL.h:1181
mvl_accumulate_int64_hash64
static LIBMVL_OFFSET64 mvl_accumulate_int64_hash64(LIBMVL_OFFSET64 x, const long long int *data, LIBMVL_OFFSET64 count)
Accumulate hash from an array of 64-bit integers. The integers are hashed by value,...
Definition: libMVL.h:1032
mvl_extend_extent_list
void mvl_extend_extent_list(LIBMVL_EXTENT_LIST *el, LIBMVL_OFFSET64 nelem)
Increase storage of previously allocated extent list.
Definition: libMVL.c:2992
mvl_named_list_get_double_default
static double mvl_named_list_get_double_default(LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx, double def)
Find an entry in a named list and return its idx value as a double.
Definition: libMVL.h:722
mvl_hash_match_count
LIBMVL_OFFSET64 mvl_hash_match_count(LIBMVL_OFFSET64 key_count, const LIBMVL_OFFSET64 *key_hash, HASH_MAP *hm)
Find count of matches between hashes of two sets.
Definition: libMVL.c:2613
mvl_write_attributes_list
LIBMVL_OFFSET64 mvl_write_attributes_list(LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L)
Write out R-style attribute list.
Definition: libMVL.c:1464
mvl_write_named_list2
LIBMVL_OFFSET64 mvl_write_named_list2(LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L, char *cl)
Write out named list. In R, this would be read back as list with class attribute set to "cl".
Definition: libMVL.c:1509
mvl_add_directory_entry
void mvl_add_directory_entry(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset, const char *tag)
Add an entry to the top level directory of MVL file.
Definition: libMVL.c:1203
mvl_vector_from_offset
static LIBMVL_VECTOR * mvl_vector_from_offset(void *data, LIBMVL_OFFSET64 offset)
A convenience function to convert an offset into memory mapped data into a pointer to LIBMVL_VECTOR s...
Definition: libMVL.h:588
mvl_vector_data_offset
#define mvl_vector_data_offset(data)
Definition: libMVL.h:528
HASH_MAP
This structure is used for constructing associative maps and also for describing index groupings.
Definition: libMVL.h:1170
mvl_as_offset
static LIBMVL_OFFSET64 mvl_as_offset(const LIBMVL_VECTOR *vec, long idx)
Return idx vector entry as an offset.
Definition: libMVL.h:674
mvl_write_vector
LIBMVL_OFFSET64 mvl_write_vector(LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 length, const void *data, LIBMVL_OFFSET64 metadata)
Write complete MVL vector.
Definition: libMVL.c:337
mvl_accumulate_int32_hash64
static LIBMVL_OFFSET64 mvl_accumulate_int32_hash64(LIBMVL_OFFSET64 x, const int *data, LIBMVL_OFFSET64 count)
Accumulate hash from an array of 32-bit integers. The integers are hashed by value,...
Definition: libMVL.h:1001
HASH_MAP::vec_count
LIBMVL_OFFSET64 vec_count
Number of vectors used to produce hashes.
Definition: libMVL.h:1180
mvl_free_named_list
void mvl_free_named_list(LIBMVL_NAMED_LIST *L)
Free structure for LIBMVL_NAMED_LIST.
Definition: libMVL.c:1317
mvl_write_cached_string
LIBMVL_OFFSET64 mvl_write_cached_string(LIBMVL_CONTEXT *ctx, long length, const char *data)
Write a single C string if it has not been written before, otherwise return offset to previously writ...
Definition: libMVL.c:719
LIBMVL_PACKED_LIST64
#define LIBMVL_PACKED_LIST64
Definition: libMVL.h:62
LIBMVL_PARTITION::size
LIBMVL_OFFSET64 size
Space allocated for the offset array.
Definition: libMVL.h:1224
LIBMVL_VECTOR_FLOAT
#define LIBMVL_VECTOR_FLOAT
Definition: libMVL.h:57
mvl_vector_type
#define mvl_vector_type(data)
Return type of data from a pointer to LIBMVL_VECTOR.
Definition: libMVL.h:467
mvl_free_extent_index_arrays
void mvl_free_extent_index_arrays(LIBMVL_EXTENT_INDEX *ei)
Free arrays of previously allocated extent index. This function does not free the structure itself.
Definition: libMVL.c:3030
mvl_write_string
LIBMVL_OFFSET64 mvl_write_string(LIBMVL_CONTEXT *ctx, long length, const char *data, LIBMVL_OFFSET64 metadata)
Write a single C string. In particular, this is handy for providing metadata tags.
Definition: libMVL.c:707
mvl_free_extent_list_arrays
void mvl_free_extent_list_arrays(LIBMVL_EXTENT_LIST *el)
Free arrays of previously allocated extent list. This function does not free the structure itself.
Definition: libMVL.c:2976
mvl_vector_data_int64
#define mvl_vector_data_int64(data)
Definition: libMVL.h:525
mvl_read_attributes_list
LIBMVL_NAMED_LIST * mvl_read_attributes_list(LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 metadata_offset)
Read back MVL attributes list, typically used to describe metadata. This function also initializes ha...
Definition: libMVL.c:1559
mvl_accumulate_float_hash64
static LIBMVL_OFFSET64 mvl_accumulate_float_hash64(LIBMVL_OFFSET64 x, const float *data, LIBMVL_OFFSET64 count)
Accumulate hash from an array of 32-bit floats. The floats are hashed by value, not representation,...
Definition: libMVL.h:1064
mvl_hash_range
int mvl_hash_range(LIBMVL_OFFSET64 i0, LIBMVL_OFFSET64 i1, LIBMVL_OFFSET64 *hash, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, int flags)
This function is used to compute 64 bit hash of vector values. Array hash[] is passed in and contains ...
Definition: libMVL.c:2395
LIBMVL_VEC_STATS::min
double min
minimum value of vector entries
Definition: libMVL.h:1313
LIBMVL_VECTOR
LIBMVL_VECTOR is the basic unit of information storage.
Definition: libMVL.h:184
HASH_MAP::hash_size
LIBMVL_OFFSET64 hash_size
size of hash, first and next arrays
Definition: libMVL.h:1173
LIBMVL_EXTENT_LIST::size
LIBMVL_OFFSET64 size
Space allocated for start and stop arrays.
Definition: libMVL.h:1243
HASH_MAP::flags
LIBMVL_OFFSET64 flags
flags describing HASH_MAP state
Definition: libMVL.h:1171
LIBMVL_VEC_STATS::max
double max
maximum value of vector entries
Definition: libMVL.h:1312
mvl_extend_partition
void mvl_extend_partition(LIBMVL_PARTITION *el, LIBMVL_OFFSET64 nelem)
Increase storage of previously allocated partition.
Definition: libMVL.c:2868
mvl_compute_vec_stats
void mvl_compute_vec_stats(const LIBMVL_VECTOR *vec, LIBMVL_VEC_STATS *stats)
Compute vector statistics, such as a bounding box.
Definition: libMVL.c:3228
mvl_verify_checksum_vector3
int mvl_verify_checksum_vector3(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, void *start, void *stop)
Compute and verify checksums for a given area. It works just like mvl_verify_checksum_vector() but ta...
Definition: libMVL.c:1158
LIBMVL_CHECKSUM_VECTOR_HEADER
This structure describes the header of MVL checksum vector.
Definition: libMVL.h:144
mvl_create_named_list
LIBMVL_NAMED_LIST * mvl_create_named_list(int size)
Allocate and initialize structure for LIBMVL_NAMED_LIST.
Definition: libMVL.c:1295
mvl_write_packed_list
LIBMVL_OFFSET64 mvl_write_packed_list(LIBMVL_CONTEXT *ctx, long count, const long *str_size, unsigned char **str, LIBMVL_OFFSET64 metadata)
Write an array of strings as a packed list data type. This is convenient for storing a lot of differe...
Definition: libMVL.c:803
LIBMVL_VECTOR_UINT8
#define LIBMVL_VECTOR_UINT8
Definition: libMVL.h:54
mvl_vector_nentries
static LIBMVL_OFFSET64 mvl_vector_nentries(void *vec)
Return number of entries in the vector. Currently this is the same as mvl_vector_length() for all typ...
Definition: libMVL.h:476
mvl_get_error
static int mvl_get_error(LIBMVL_CONTEXT *ctx)
Obtain integer error code.
Definition: libMVL.h:332
LIBMVL_VECTOR_INT64
#define LIBMVL_VECTOR_INT64
Definition: libMVL.h:56
mvl_compute_extent_index
int mvl_compute_extent_index(LIBMVL_EXTENT_INDEX *ei, LIBMVL_OFFSET64 count, LIBMVL_VECTOR **vec, void **data, LIBMVL_OFFSET64 *data_length)
Compute an extent index.
Definition: libMVL.c:3064
MVL_CONTEXT_DATA
#define MVL_CONTEXT_DATA(ctx)
Definition: libMVL.h:285
LIBMVL_EXTENT_INDEX
An index into a table-like set of vectors with equal number of elements.
Definition: libMVL.h:1260
HASH_MAP::hash_count
LIBMVL_OFFSET64 hash_count
Number of valid entries in hash, hash_count < hash_size and hash_count < hash_map_size.
Definition: libMVL.h:1172
LIBMVL_VECTOR_INT32
#define LIBMVL_VECTOR_INT32
Definition: libMVL.h:55
mvl_verify_checksum_vector
int mvl_verify_checksum_vector(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop)
Compute and verify checksums for a given area.
Definition: libMVL.c:951
mvl_create_context
LIBMVL_CONTEXT * mvl_create_context(void)
Create MVL context.
Definition: libMVL.c:150
LIBMVL_PARTITION
List of offsets partitioning the vector. First element is always 0, last element is vector size.
Definition: libMVL.h:1223
mvl_add_list_entry
long mvl_add_list_entry(LIBMVL_NAMED_LIST *L, long tag_length, const char *tag, LIBMVL_OFFSET64 offset)
Add entry to LIBMVL_NAMED_LIST. The entry is always appended to the end.
Definition: libMVL.c:1369
mvl_write_extent_index
LIBMVL_OFFSET64 mvl_write_extent_index(LIBMVL_CONTEXT *ctx, LIBMVL_EXTENT_INDEX *ei)
Write extent index to MVL file.
Definition: libMVL.c:3114
HASH_MAP::first
LIBMVL_OFFSET64 * first
array of indices in each group
Definition: libMVL.h:1178
HASH_MAP::hash
LIBMVL_OFFSET64 * hash
Input hashes, used by mvl_compute_hash_map()
Definition: libMVL.h:1176
mvl_hash_indices
int mvl_hash_indices(LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 *hash, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, int flags)
This function is used to compute 64 bit hash of vector values. Array hash[] is passed in and contains ...
Definition: libMVL.c:2293
mvl_find_repeats
void mvl_find_repeats(LIBMVL_PARTITION *partition, LIBMVL_OFFSET64 count, LIBMVL_VECTOR **vec, void **data, LIBMVL_OFFSET64 *data_length)
Compute list of extents describing stretches of data with identical values.
Definition: libMVL.c:2887
LIBMVL_PREAMBLE
This structure is written at the beginning of MVL file. It contains the signature identifying MVL for...
Definition: libMVL.h:102
mvl_write_hash64_checksum_vector
LIBMVL_OFFSET64 mvl_write_hash64_checksum_vector(LIBMVL_CONTEXT *ctx, void *base, LIBMVL_OFFSET64 checksum_area_start, LIBMVL_OFFSET64 checksum_area_stop, LIBMVL_OFFSET64 checksum_block_size)
Compute and write checksums for a given area.
Definition: libMVL.c:851
mvl_as_double
static double mvl_as_double(const LIBMVL_VECTOR *vec, long idx)
Return idx vector entry as a double.
Definition: libMVL.h:620
mvl_read_named_list
LIBMVL_NAMED_LIST * mvl_read_named_list(LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset)
Read back MVL named list. This function also initializes the hash table for fast access.
Definition: libMVL.c:1628
mvl_create_R_attributes_list
LIBMVL_NAMED_LIST * mvl_create_R_attributes_list(LIBMVL_CONTEXT *ctx, const char *R_class)
Create R-style attribute list for class given by R_class, which could be, for example,...
Definition: libMVL.c:1450
mvl_open
void mvl_open(LIBMVL_CONTEXT *ctx, FILE *f)
Prepare context for writing to file f.
Definition: libMVL.c:1726
LIBMVL_VECTOR_CSTRING
#define LIBMVL_VECTOR_CSTRING
Definition: libMVL.h:60
mvl_vector_data_int32
#define mvl_vector_data_int32(data)
Definition: libMVL.h:524
LIBMVL_CONTEXT
This structure describes MVL context - a collection of system data associated with a single MVL file.
Definition: libMVL.h:255
mvl_write_directory
LIBMVL_OFFSET64 mvl_write_directory(LIBMVL_CONTEXT *ctx)
Write out MVL file directory with entries collected so far. If this is called multiple times only the...
Definition: libMVL.c:1248
mvl_add_directory_entry_n
void mvl_add_directory_entry_n(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset, const char *tag, LIBMVL_OFFSET64 tag_size)
Add entry to the top level directory of MVL file.
Definition: libMVL.c:1227
mvl_randomize_bits32
static unsigned mvl_randomize_bits32(unsigned x)
Randomize bits of 32-bit numbers, typically after accumulating a hash value.
Definition: libMVL.h:905
mvl_init_extent_index
void mvl_init_extent_index(LIBMVL_EXTENT_INDEX *ei)
Initialize freshly allocated extent index structure.
Definition: libMVL.c:3020
mvl_get_extents
static void mvl_get_extents(LIBMVL_EXTENT_INDEX *ei, LIBMVL_OFFSET64 hash, LIBMVL_EXTENT_LIST *el)
Find extents in index corresponding to a given hash.
Definition: libMVL.h:1288
mvl_sort_indices
int mvl_sort_indices(LIBMVL_OFFSET64 indices_count, LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, int sort_function)
Given a table-like set of vectors of equal length arrange indices so that the columns are sorted lexi...
Definition: libMVL_sort.cc:354
LIBMVL_EXTENT_LIST::count
LIBMVL_OFFSET64 count
extent has count valid elements
Definition: libMVL.h:1244
LIBMVL_VEC_STATS::average_repeat_length
double average_repeat_length
average length of stretch with identical elements
Definition: libMVL.h:1316
mvl_rewrite_vector
void mvl_rewrite_vector(LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 base_offset, LIBMVL_OFFSET64 idx, long length, const void *data)
Write more data to MVL vector that has been previously created with mvl_start_write_vector()
Definition: libMVL.c:470
mvl_packed_list_is_na
static int mvl_packed_list_is_na(const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 idx)
Check whether packed list entry is a special string that indicates a missing value.
Definition: libMVL.h:777
mvl_validate_vector
static int mvl_validate_vector(LIBMVL_OFFSET64 offset, const void *data, LIBMVL_OFFSET64 data_size)
This function returns 0 if the offset into data points to a valid vector, or a negative error code ot...
Definition: libMVL.h:541
mvl_get_character_class_offset
LIBMVL_OFFSET64 mvl_get_character_class_offset(LIBMVL_CONTEXT *ctx)
Get offset to metadata describing R-style character class - an array of strings. This is convenient f...
Definition: libMVL.c:1187
LIBMVL_OFFSET64
unsigned long long LIBMVL_OFFSET64
MVL unsigned 64-bit type used for describing offsets into loaded data.
Definition: libMVL.h:98
mvl_free_context
void mvl_free_context(LIBMVL_CONTEXT *ctx)
Release memory associated with MVL context.
Definition: libMVL.c:190
mvl_vector_data_float
#define mvl_vector_data_float(data)
Definition: libMVL.h:526
mvl_validate_vector2
static int mvl_validate_vector2(LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset)
A convenience version of mvl_validate_vector() that uses data and data_size from MVL context....
Definition: libMVL.h:576
LIBMVL_VECTOR_OFFSET64
#define LIBMVL_VECTOR_OFFSET64
Definition: libMVL.h:59
LIBMVL_EXTENT_LIST
List of extents - ranges of consecutive indices. Similar to partition, but they do not have to foll...
Definition: libMVL.h:1242
mvl_verify_checksum_vector2
int mvl_verify_checksum_vector2(LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 vector_offset)
Compute and verify checksums for the entire area occupied by given LIBMVL_VECTOR. Metadata is not che...
Definition: libMVL.c:1095
HASH_MAP::hash_map
LIBMVL_OFFSET64 * hash_map
This is an associative table mapping hash & (hash_map_size-1) into indices in the "first" array.
Definition: libMVL.h:1177
mvl_start_write_vector
LIBMVL_OFFSET64 mvl_start_write_vector(LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 expected_length, LIBMVL_OFFSET64 length, const void *data, LIBMVL_OFFSET64 metadata)
Begin write of MVL vector. This is only needed if the vector has to be written in parts,...
Definition: libMVL.c:387
mvl_recompute_named_list_hash
void mvl_recompute_named_list_hash(LIBMVL_NAMED_LIST *L)
Recompute named list hash.
Definition: libMVL.c:1332
MVL_CONTEXT_DATA_SIZE
#define MVL_CONTEXT_DATA_SIZE(ctx)
Definition: libMVL.h:290