libMVL
Mappable vector library
|
core libMVL functions and structures More...
#include <stdio.h>
#include <math.h>
Go to the source code of this file.
Classes | |
struct | LIBMVL_PREAMBLE |
This structure is written at the beginning of an MVL file. It contains the signature identifying the MVL format, and a means to check the endianness of the MVL file. More... | |
struct | LIBMVL_POSTAMBLE |
This structure is written last to close MVL file. It contains an offset to MVL directory that can be used to retrieve offsets to LIBMVL_VECTOR structures stored in MVL file. More... | |
struct | LIBMVL_VECTOR_HEADER |
This structure describes the header of MVL vector. It is basically LIBMVL_VECTOR without the actual data. More... | |
struct | LIBMVL_CHECKSUM_VECTOR_HEADER |
This structure describes the header of MVL checksum vector. More... | |
struct | LIBMVL_VECTOR |
LIBMVL_VECTOR is the basic unit of information storage. More... | |
struct | LIBMVL_NAMED_LIST |
This structure describes a named list - an array of LIBMVL_OFFSET64 entries each with a character name or tag. More... | |
struct | LIBMVL_CONTEXT |
This structure describes MVL context - a collection of system data associated with a single MVL file. More... | |
struct | HASH_MAP |
This structure is used for constructing associative maps and also for describing index groupings. More... | |
struct | LIBMVL_PARTITION |
List of offsets partitioning the vector. First element is always 0, last element is vector size. More... | |
struct | LIBMVL_EXTENT_LIST |
List of extents - ranges of consecutive indices. Similar to partition, but they do not have to follow each other. More... | |
struct | LIBMVL_EXTENT_INDEX |
An index into a table-like set of vectors with equal number of elements. More... | |
struct | LIBMVL_VEC_STATS |
Vector statistics. More... | |
Macros | |
#define | LIBMVL_VECTOR_UINT8 1 |
#define | LIBMVL_VECTOR_INT32 2 |
#define | LIBMVL_VECTOR_INT64 3 |
#define | LIBMVL_VECTOR_FLOAT 4 |
#define | LIBMVL_VECTOR_DOUBLE 5 |
#define | LIBMVL_VECTOR_OFFSET64 100 |
#define | LIBMVL_VECTOR_CSTRING 101 |
#define | LIBMVL_PACKED_LIST64 102 |
#define | LIBMVL_CHECKSUM_ALGORITHM_INTERNAL1_HASH64 1 |
#define | LIBMVL_FULL_CHECKSUMS_DIRECTORY_KEY "MVL_FULL_CHECKSUMS" |
#define | MVL_CONTEXT_DATA(ctx) (ctx->data) |
#define | MVL_CONTEXT_DATA_SIZE(ctx) (ctx->data_size) |
#define | LIBMVL_NO_METADATA 0 |
Use this constant to specify that no metadata should be written. | |
#define | LIBMVL_NULL_OFFSET 0 |
Null offsets into memory mapped data are always invalid because that is where the preamble is. This is usually used to indicate that the offset does not point to valid data. | |
#define | MVL_WVEC(ctx, type, ...) mvl_write_vector_inline(ctx, type, MVL_NUMARGS(__VA_ARGS__), 0, __VA_ARGS__) |
#define | mvl_vector_type(data) (((LIBMVL_VECTOR_HEADER *)(data))->type) |
Return type of data from a pointer to LIBMVL_VECTOR. | |
#define | mvl_vector_length(data) (((LIBMVL_VECTOR_HEADER *)(data))->length) |
Return number of elements from a pointer to LIBMVL_VECTOR. | |
#define | mvl_vector_data(data) ((((LIBMVL_VECTOR *)(data))->u)) |
Return base data from a pointer to LIBMVL_VECTOR. More... | |
#define | mvl_vector_data_uint8(data) ((unsigned char *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
#define | mvl_vector_data_int32(data) ((int *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
#define | mvl_vector_data_int64(data) ((long long int *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
#define | mvl_vector_data_float(data) ((float *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
#define | mvl_vector_data_double(data) ((double *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
#define | mvl_vector_data_offset(data) ((LIBMVL_OFFSET64 *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
#define | mvl_vector_metadata_offset(data) ((((LIBMVL_VECTOR_HEADER *)(data))->metadata)) |
Return offset to metadata of given LIBMVL_VECTOR. | |
#define | MVL_NA_STRING "\000\000NA" |
It is convenient to be able to mark strings as missing values, similar to NaN for floating point types. In MVL this is done with the special string of length 4 consisting of two NUL characters followed by the letters "NA". More... | |
#define | LIBMVL_SORT_LEXICOGRAPHIC 1 /* Ascending */ |
#define | LIBMVL_SORT_LEXICOGRAPHIC_DESC 2 /* Descending */ |
#define | MVL_SEED_HASH_VALUE 0xabcdef |
#define | LIBMVL_ACCUMULATE_HASH 0 |
Flags passed to mvl_hash_indices() and mvl_hash_range() More... | |
#define | LIBMVL_INIT_HASH 1 |
#define | LIBMVL_FINALIZE_HASH 2 |
#define | LIBMVL_COMPLETE_HASH (LIBMVL_INIT_HASH | LIBMVL_FINALIZE_HASH) |
#define | MVL_FLAG_OWN_HASH (1<<0) |
Flags describing HASH_MAP state. More... | |
#define | MVL_FLAG_OWN_HASH_MAP (1<<1) |
#define | MVL_FLAG_OWN_FIRST (1<<2) |
#define | MVL_FLAG_OWN_NEXT (1<<3) |
#define | MVL_EXTENT_INDEX 1 |
Index types. More... | |
Typedefs | |
typedef unsigned long long | LIBMVL_OFFSET64 |
MVL unsigned 64-bit type used for describing offsets into loaded data. | |
Functions | |
static int | mvl_element_size (int type) |
Return the element size in bytes for a particular MVL type. More... | |
LIBMVL_CONTEXT * | mvl_create_context (void) |
Create MVL context. More... | |
void | mvl_free_context (LIBMVL_CONTEXT *ctx) |
Release memory associated with MVL context. More... | |
static int | mvl_get_error (LIBMVL_CONTEXT *ctx) |
Obtain integer error code. More... | |
static void | mvl_clear_error (LIBMVL_CONTEXT *ctx) |
Clear error code. More... | |
const char * | mvl_strerror (LIBMVL_CONTEXT *ctx) |
Obtain description of error code. More... | |
LIBMVL_OFFSET64 | mvl_write_vector (LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 length, const void *data, LIBMVL_OFFSET64 metadata) |
Write complete MVL vector. More... | |
LIBMVL_OFFSET64 | mvl_start_write_vector (LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 expected_length, LIBMVL_OFFSET64 length, const void *data, LIBMVL_OFFSET64 metadata) |
Begin write of MVL vector. This is only needed if the vector has to be written in parts, such as due to memory constraints. More... | |
void | mvl_rewrite_vector (LIBMVL_CONTEXT *ctx, int type, LIBMVL_OFFSET64 base_offset, LIBMVL_OFFSET64 idx, long length, const void *data) |
Write more data to MVL vector that has been previously created with mvl_start_write_vector() More... | |
LIBMVL_OFFSET64 | mvl_write_concat_vectors (LIBMVL_CONTEXT *ctx, int type, long nvec, const long *lengths, void **data, LIBMVL_OFFSET64 metadata) |
Write complete MVL vector concatenating data from many vectors or arrays. More... | |
LIBMVL_OFFSET64 | mvl_indexed_copy_vector (LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 index_count, const LIBMVL_OFFSET64 *indices, const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 data_length, LIBMVL_OFFSET64 metadata, LIBMVL_OFFSET64 max_buffer) |
Write MVL vector that contains data at specific indices. The indices can repeat, and can themselves be stored in memory mapped MVL file. More... | |
LIBMVL_OFFSET64 | mvl_write_string (LIBMVL_CONTEXT *ctx, long length, const char *data, LIBMVL_OFFSET64 metadata) |
Write a single C string. In particular, this is handy for providing metadata tags. More... | |
LIBMVL_OFFSET64 | mvl_write_cached_string (LIBMVL_CONTEXT *ctx, long length, const char *data) |
Write a single C string if it has not been written before, otherwise return offset to previously written object. In particular, this is handy for providing metadata tags. More... | |
LIBMVL_OFFSET64 | mvl_write_packed_list (LIBMVL_CONTEXT *ctx, long count, const long *str_size, unsigned char **str, LIBMVL_OFFSET64 metadata) |
Write an array of strings as a packed list data type. This is convenient for storing a lot of different strings. More... | |
LIBMVL_OFFSET64 | mvl_write_hash64_checksum_vector (LIBMVL_CONTEXT *ctx, void *base, LIBMVL_OFFSET64 checksum_area_start, LIBMVL_OFFSET64 checksum_area_stop, LIBMVL_OFFSET64 checksum_block_size) |
Compute and write checksums for a given area. More... | |
int | mvl_verify_checksum_vector (LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 start, LIBMVL_OFFSET64 stop) |
Compute and verify checksums for a given area. More... | |
int | mvl_verify_full_checksum_vector (LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size) |
Compute and verify checksums for the entire area covered by checksum vector. More... | |
int | mvl_verify_checksum_vector2 (LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 vector_offset) |
Compute and verify checksums for the entire area occupied by given LIBMVL_VECTOR. Metadata is not checked. More... | |
int | mvl_verify_checksum_vector3 (LIBMVL_CONTEXT *ctx, const LIBMVL_VECTOR *checksum_vector, void *data, LIBMVL_OFFSET64 data_size, void *start, void *stop) |
Compute and verify checksums for a given area. It works just like mvl_verify_checksum_vector() but takes pointers instead of offsets. More... | |
void | mvl_add_directory_entry (LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset, const char *tag) |
Add an entry to the top level directory of MVL file. More... | |
void | mvl_add_directory_entry_n (LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset, const char *tag, LIBMVL_OFFSET64 tag_size) |
Add entry to the top level directory of MVL file. More... | |
LIBMVL_OFFSET64 | mvl_write_directory (LIBMVL_CONTEXT *ctx) |
Write out MVL file directory with entries collected so far. If this is called multiple times only the latest written directory is retrieved when MVL file is opened. It is an error to write out an empty directory. More... | |
LIBMVL_NAMED_LIST * | mvl_create_named_list (int size) |
Allocate and initialize structure for LIBMVL_NAMED_LIST. More... | |
void | mvl_free_named_list (LIBMVL_NAMED_LIST *L) |
Free structure for LIBMVL_NAMED_LIST. More... | |
void | mvl_recompute_named_list_hash (LIBMVL_NAMED_LIST *L) |
Recompute named list hash. More... | |
long | mvl_add_list_entry (LIBMVL_NAMED_LIST *L, long tag_length, const char *tag, LIBMVL_OFFSET64 offset) |
Add entry to LIBMVL_NAMED_LIST. The entry is always appended to the end. More... | |
LIBMVL_OFFSET64 | mvl_find_list_entry (LIBMVL_NAMED_LIST *L, long tag_length, const char *tag) |
Find existing entry inside LIBMVL_NAMED_LIST. If several identically named entries exist this function returns last written value. Hash table is used if present. More... | |
LIBMVL_OFFSET64 | mvl_write_attributes_list (LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L) |
Write out R-style attribute list. More... | |
LIBMVL_NAMED_LIST * | mvl_read_attributes_list (LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 metadata_offset) |
Read back MVL attributes list, typically used to describe metadata. This function also initializes the hash table for fast access. This function does not check that the offsets stored in the returned LIBMVL_NAMED_LIST data structure are valid; this should be done by the code that uses those offsets. More... | |
LIBMVL_NAMED_LIST * | mvl_create_R_attributes_list (LIBMVL_CONTEXT *ctx, const char *R_class) |
Create R-style attribute list for class given by R_class, which could be, for example, "data.frame". More... | |
LIBMVL_OFFSET64 | mvl_get_character_class_offset (LIBMVL_CONTEXT *ctx) |
Get offset to metadata describing R-style character class - an array of strings. This is convenient for writing columns of strings to be analyzed with R - just provide this offset as the metadata field of mvl_write_packed_list() More... | |
LIBMVL_OFFSET64 | mvl_write_named_list (LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L) |
Write out named list. In R, this would be read back as list. More... | |
LIBMVL_OFFSET64 | mvl_write_named_list2 (LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L, char *cl) |
Write out named list. In R, this would be read back as list with class attribute set to "cl". More... | |
LIBMVL_OFFSET64 | mvl_write_named_list_as_data_frame (LIBMVL_CONTEXT *ctx, LIBMVL_NAMED_LIST *L, int nrows, LIBMVL_OFFSET64 rownames) |
Write out named list in the style of R data frames. It is assumed that all entries of L are vectors with the same number of elements. More... | |
LIBMVL_NAMED_LIST * | mvl_read_named_list (LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset) |
Read back MVL named list. This function also initializes the hash table for fast access. More... | |
void | mvl_open (LIBMVL_CONTEXT *ctx, FILE *f) |
Prepare context for writing to file f. More... | |
void | mvl_close (LIBMVL_CONTEXT *ctx) |
Write out MVL file directory and postamble and close file. More... | |
static LIBMVL_OFFSET64 | mvl_vector_nentries (void *vec) |
Return number of entries in the vector. Currently this is the same as mvl_vector_length() for all types except LIBMVL_PACKED_LIST64. More... | |
static int | mvl_validate_vector (LIBMVL_OFFSET64 offset, const void *data, LIBMVL_OFFSET64 data_size) |
This function returns 0 if the offset into data points to a valid vector, or a negative error code otherwise. More... | |
static int | mvl_validate_vector2 (LIBMVL_CONTEXT *ctx, LIBMVL_OFFSET64 offset) |
A convenience version of mvl_validate_vector() that uses data and data_size from MVL context. This function returns 0 if the offset into data points to a valid vector, or a negative error code otherwise. More... | |
static LIBMVL_VECTOR * | mvl_vector_from_offset (void *data, LIBMVL_OFFSET64 offset) |
A convenience function to convert an offset into memory mapped data into a pointer to LIBMVL_VECTOR structure. More... | |
static LIBMVL_VECTOR * | mvl_validated_vector_from_offset (void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset) |
A convenience function to convert an offset into memory mapped data into a pointer to LIBMVL_VECTOR structure. More... | |
static double | mvl_as_double (const LIBMVL_VECTOR *vec, long idx) |
Return idx vector entry as a double. More... | |
static double | mvl_as_double_default (const LIBMVL_VECTOR *vec, long idx, double def) |
Return idx vector entry as a double, with default for missing values. More... | |
static LIBMVL_OFFSET64 | mvl_as_offset (const LIBMVL_VECTOR *vec, long idx) |
Return idx vector entry as an offset. More... | |
static double | mvl_named_list_get_double (LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx) |
Find an entry in a named list and return its idx value as a double. More... | |
static double | mvl_named_list_get_double_default (LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx, double def) |
Find an entry in a named list and return its idx value as a double. More... | |
static LIBMVL_OFFSET64 | mvl_named_list_get_offset (LIBMVL_NAMED_LIST *L, const void *data, long tag_length, const char *tag, long idx) |
Find an entry in a named list and return its idx value as an offset. More... | |
static int | mvl_packed_list_is_na (const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 idx) |
Check whether packed list entry is a special string that indicates a missing value. More... | |
static LIBMVL_OFFSET64 | mvl_packed_list_get_entry_bytelength (const LIBMVL_VECTOR *vec, LIBMVL_OFFSET64 idx) |
Get length in bytes of string element idx from a packed list. More... | |
static const unsigned char * | mvl_packed_list_get_entry (const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 idx) |
Get pointer to the start of string element idx from a packed list. More... | |
static int | mvl_packed_list_validate_entry (const LIBMVL_VECTOR *vec, const void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 idx) |
Check that string element idx from a packed list is valid. More... | |
LIBMVL_OFFSET64 | mvl_find_directory_entry (LIBMVL_CONTEXT *ctx, const char *tag) |
Find entry in MVL file directory. More... | |
void | mvl_load_image (LIBMVL_CONTEXT *ctx, const void *data, LIBMVL_OFFSET64 length) |
Initialize MVL context to operate with memory mapped area data. More... | |
int | mvl_sort_indices (LIBMVL_OFFSET64 indices_count, LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, int sort_function) |
Given a table-like set of vectors of equal length arrange indices so that the columns are sorted lexicographically. More... | |
static LIBMVL_OFFSET64 | mvl_randomize_bits64 (LIBMVL_OFFSET64 x) |
Randomize bits of 64-bit numbers, typically after accumulating a hash value. More... | |
static unsigned | mvl_randomize_bits32 (unsigned x) |
Randomize bits of 32-bit numbers, typically after accumulating a hash value. More... | |
static LIBMVL_OFFSET64 | mvl_accumulate_hash64 (LIBMVL_OFFSET64 x, const unsigned char *data, LIBMVL_OFFSET64 count) |
Accumulate hash from a piece of data. More... | |
static LIBMVL_OFFSET64 | mvl_accumulate_int32_hash64 (LIBMVL_OFFSET64 x, const int *data, LIBMVL_OFFSET64 count) |
Accumulate hash from an array of 32-bit integers. The integers are hashed by value, not representation, so one gets the same hash from value of 100 whether it is stored as 32-bits or 64-bits. More... | |
static LIBMVL_OFFSET64 | mvl_accumulate_int64_hash64 (LIBMVL_OFFSET64 x, const long long int *data, LIBMVL_OFFSET64 count) |
Accumulate hash from an array of 64-bit integers. The integers are hashed by value, not representation, so one gets the same hash from value of 100 whether it is stored as 32-bits or 64-bits. More... | |
static LIBMVL_OFFSET64 | mvl_accumulate_float_hash64 (LIBMVL_OFFSET64 x, const float *data, LIBMVL_OFFSET64 count) |
Accumulate hash from an array of 32-bit floats. The floats are hashed by value, not representation, so one gets the same hash from value of 100.0 whether it is stored as float or promoted to double. Note that this does not work in reverse - many doubles can be truncated to the same float. More... | |
static LIBMVL_OFFSET64 | mvl_accumulate_double_hash64 (LIBMVL_OFFSET64 x, const double *data, LIBMVL_OFFSET64 count) |
Accumulate hash from an array of 64-bit floats. The floats are hashed by value, not representation, so one gets the same hash from value of 100.0 whether it is stored as float or promoted to double. Note that this does not work in reverse - many doubles can be truncated to the same float. More... | |
int | mvl_hash_indices (LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 *hash, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, int flags) |
This function is used to compute 64-bit hash of vector values. The array hash[] is passed in and contains the result of the computation. More... | |
int | mvl_hash_range (LIBMVL_OFFSET64 i0, LIBMVL_OFFSET64 i1, LIBMVL_OFFSET64 *hash, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, int flags) |
This function is used to compute 64-bit hash of vector values. The array hash[] is passed in and contains the result of the computation. More... | |
LIBMVL_OFFSET64 | mvl_compute_hash_map_size (LIBMVL_OFFSET64 hash_count) |
Compute suggested size of hash map given the number of entries to hash. Hash map size should always be a power of 2. More... | |
HASH_MAP * | mvl_allocate_hash_map (LIBMVL_OFFSET64 max_index_count) |
Create HASH_MAP structure. More... | |
void | mvl_free_hash_map (HASH_MAP *hash_map) |
Free allocated HASH_MAP. More... | |
void | mvl_compute_hash_map (HASH_MAP *hm) |
Compute hash map. This assumes that hm->hash array has been populated with hm->hash_count hashes computed with mvl_hash_indices(). More... | |
LIBMVL_OFFSET64 | mvl_hash_match_count (LIBMVL_OFFSET64 key_count, const LIBMVL_OFFSET64 *key_hash, HASH_MAP *hm) |
Find count of matches between hashes of two sets. More... | |
int | mvl_find_matches (LIBMVL_OFFSET64 key_indices_count, const LIBMVL_OFFSET64 *key_indices, LIBMVL_OFFSET64 key_vec_count, LIBMVL_VECTOR **key_vec, void **key_vec_data, LIBMVL_OFFSET64 *key_vec_data_length, LIBMVL_OFFSET64 *key_hash, LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, HASH_MAP *hm, LIBMVL_OFFSET64 *key_last, LIBMVL_OFFSET64 pairs_size, LIBMVL_OFFSET64 *key_match_indices, LIBMVL_OFFSET64 *match_indices) |
Compute pairs of merge indices. This is similar to JOIN operation in SQL. More... | |
void | mvl_find_groups (LIBMVL_OFFSET64 indices_count, const LIBMVL_OFFSET64 *indices, LIBMVL_OFFSET64 vec_count, LIBMVL_VECTOR **vec, void **vec_data, LIBMVL_OFFSET64 *vec_data_length, HASH_MAP *hm) |
This function transforms HASH_MAP into a list of groups. Similar to GROUP BY clause in SQL. More... | |
void | mvl_extend_partition (LIBMVL_PARTITION *el, LIBMVL_OFFSET64 nelem) |
Increase storage of previously allocated partition. More... | |
void | mvl_find_repeats (LIBMVL_PARTITION *partition, LIBMVL_OFFSET64 count, LIBMVL_VECTOR **vec, void **data, LIBMVL_OFFSET64 *data_length) |
Compute list of extents describing stretches of data with identical values. More... | |
void | mvl_free_partition_arrays (LIBMVL_PARTITION *el) |
Free arrays of previously allocated partition. This function does not free the structure itself. More... | |
void | mvl_init_extent_list (LIBMVL_EXTENT_LIST *el) |
Initialize freshly allocated extent list structure. More... | |
void | mvl_free_extent_list_arrays (LIBMVL_EXTENT_LIST *el) |
Free arrays of previously allocated extent list. This function does not free the structure itself. More... | |
void | mvl_extend_extent_list (LIBMVL_EXTENT_LIST *el, LIBMVL_OFFSET64 nelem) |
Increase storage of previously allocated extent list. More... | |
void | mvl_init_extent_index (LIBMVL_EXTENT_INDEX *ei) |
Initialize freshly allocated extent index structure. More... | |
void | mvl_free_extent_index_arrays (LIBMVL_EXTENT_INDEX *ei) |
Free arrays of previously allocated extent index. This function does not free the structure itself. More... | |
int | mvl_compute_extent_index (LIBMVL_EXTENT_INDEX *ei, LIBMVL_OFFSET64 count, LIBMVL_VECTOR **vec, void **data, LIBMVL_OFFSET64 *data_length) |
Compute an extent index. More... | |
LIBMVL_OFFSET64 | mvl_write_extent_index (LIBMVL_CONTEXT *ctx, LIBMVL_EXTENT_INDEX *ei) |
Write extent index to MVL file. More... | |
int | mvl_load_extent_index (LIBMVL_CONTEXT *ctx, void *data, LIBMVL_OFFSET64 data_size, LIBMVL_OFFSET64 offset, LIBMVL_EXTENT_INDEX *ei) |
Load extent index from memory mapped MVL file. | |
static void | mvl_empty_extent_list (LIBMVL_EXTENT_LIST *el) |
Alter extent list to contain no extents without freeing memory. More... | |
static void | mvl_get_extents (LIBMVL_EXTENT_INDEX *ei, LIBMVL_OFFSET64 hash, LIBMVL_EXTENT_LIST *el) |
Find extents in index corresponding to a given hash. More... | |
void | mvl_compute_vec_stats (const LIBMVL_VECTOR *vec, LIBMVL_VEC_STATS *stats) |
Compute vector statistics, such as a bounding box. More... | |
void | mvl_normalize_vector (const LIBMVL_VECTOR *vec, const LIBMVL_VEC_STATS *stats, LIBMVL_OFFSET64 i0, LIBMVL_OFFSET64 i1, double *out) |
Normalize vector More... | |
core libMVL functions and structures
Definition in file libMVL.h.
#define LIBMVL_ACCUMULATE_HASH 0 |
Flags passed to mvl_hash_indices() and mvl_hash_range()
Use LIBMVL_COMPLETE_HASH when the computation is done in a single call; otherwise use the individual flags to spread the computation over multiple calls. Initialization and finalization can also be done outside of mvl_hash_*() functions.
No initialization or finalization, just accumulate hash value
#define LIBMVL_CHECKSUM_ALGORITHM_INTERNAL1_HASH64 1 |
#define LIBMVL_COMPLETE_HASH (LIBMVL_INIT_HASH | LIBMVL_FINALIZE_HASH) |
#define LIBMVL_FINALIZE_HASH 2 |
#define LIBMVL_FULL_CHECKSUMS_DIRECTORY_KEY "MVL_FULL_CHECKSUMS" |
#define LIBMVL_INIT_HASH 1 |
#define LIBMVL_PACKED_LIST64 102 |
The main purpose of this type is to provide efficient storage for vectors of short strings. This is stored as LIBMVL_VECTOR_OFFSET64 with offset[0] pointing to the start of basic vector and subsequent offsets pointing to the start of the next string. For convenience the last entry points to the end of the last string.
Thus the number of strings in PACKED_LIST64 is length-1.
The usage of 64-bit offsets allows for arbitrarily long strings in the list, while requiring only minimal overhead for each string.
The type is separate from LIBMVL_VECTOR_OFFSET64 to facilitate automated tree traversal.
#define LIBMVL_SORT_LEXICOGRAPHIC 1 /* Ascending */ |
#define LIBMVL_SORT_LEXICOGRAPHIC_DESC 2 /* Descending */ |
#define LIBMVL_VECTOR_CSTRING 101 |
#define LIBMVL_VECTOR_DOUBLE 5 |
#define LIBMVL_VECTOR_FLOAT 4 |
#define LIBMVL_VECTOR_INT32 2 |
#define LIBMVL_VECTOR_INT64 3 |
#define LIBMVL_VECTOR_OFFSET64 100 |
#define LIBMVL_VECTOR_UINT8 1 |
#define MVL_CONTEXT_DATA | ( | ctx | ) | (ctx->data) |
Returns pointer to in-memory image of MVL file loaded with mvl_load_image()
#define MVL_CONTEXT_DATA_SIZE | ( | ctx | ) | (ctx->data_size) |
Returns size of in-memory image of MVL file loaded with mvl_load_image()
#define MVL_FLAG_OWN_FIRST (1<<2) |
#define MVL_FLAG_OWN_HASH (1<<0) |
#define MVL_FLAG_OWN_HASH_MAP (1<<1) |
#define MVL_FLAG_OWN_NEXT (1<<3) |
#define MVL_NA_STRING "\000\000NA" |
#define MVL_SEED_HASH_VALUE 0xabcdef |
#define mvl_vector_data | ( | data | ) | ((((LIBMVL_VECTOR *)(data))->u)) |
Return base data from a pointer to LIBMVL_VECTOR.
#define mvl_vector_data_double | ( | data | ) | ((double *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
Access DOUBLE array of LIBMVL_VECTOR
#define mvl_vector_data_float | ( | data | ) | ((float *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
Access FLOAT array of LIBMVL_VECTOR
#define mvl_vector_data_int32 | ( | data | ) | ((int *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
Access INT32 array of LIBMVL_VECTOR
#define mvl_vector_data_int64 | ( | data | ) | ((long long int *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
Access INT64 array of LIBMVL_VECTOR
#define mvl_vector_data_offset | ( | data | ) | ((LIBMVL_OFFSET64 *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
Access LIBMVL_OFFSET64 array of LIBMVL_VECTOR
#define mvl_vector_data_uint8 | ( | data | ) | ((unsigned char *)(((const char *) data)+sizeof(LIBMVL_VECTOR_HEADER))) |
Access UINT8 array of LIBMVL_VECTOR
#define MVL_WVEC | ( | ctx, | |
type, | |||
... | |||
) | mvl_write_vector_inline(ctx, type, MVL_NUMARGS(__VA_ARGS__), 0, __VA_ARGS__) |
|
inlinestatic |
Accumulate hash from an array of 64-bit floats. The floats are hashed by value, not representation, so one gets the same hash from value of 100.0 whether it is stored as float or promoted to double. Note that this does not work in reverse - many doubles can be truncated to the same float.
This function allows computing the hash of data in several stages.
x | previous hash value |
data | array of 64-bit floats |
count | length of data |
|
inlinestatic |
Accumulate hash from an array of 32-bit floats. The floats are hashed by value, not representation, so one gets the same hash from value of 100.0 whether it is stored as float or promoted to double. Note that this does not work in reverse - many doubles can be truncated to the same float.
This function allows computing the hash of data in several stages.
x | previous hash value |
data | array of 32-bit floats |
count | length of data |
|
inlinestatic |
|
inlinestatic |
Accumulate hash from an array of 32-bit integers. The integers are hashed by value, not representation, so one gets the same hash from value of 100 whether it is stored as 32-bits or 64-bits.
This function allows computing the hash of data in several stages.
x | previous hash value |
data | array of 32-bit integers |
count | length of data |
|
inlinestatic |
Accumulate hash from an array of 64-bit integers. The integers are hashed by value, not representation, so one gets the same hash from value of 100 whether it is stored as 32-bits or 64-bits.
This function allows computing the hash of data in several stages.
x | previous hash value |
data | array of 64-bit integers |
count | length of data |
void mvl_add_directory_entry | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_OFFSET64 | offset, | ||
const char * | tag | ||
) |
Add an entry to the top level directory of MVL file.
ctx | MVL context pointer that has been initialized for writing |
offset | directory entry value - typically an offset pointing to previously written MVL object |
tag | C string describing directory entry. When necessary, these can repeat, in which case the last written entry is retrieved first. |
void mvl_add_directory_entry_n | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_OFFSET64 | offset, | ||
const char * | tag, | ||
LIBMVL_OFFSET64 | tag_size | ||
) |
Add entry to the top level directory of MVL file.
ctx | MVL context pointer that has been initialized for writing |
offset | directory entry value - typically an offset pointing to previously written MVL object |
tag | string describing directory entry. When necessary, these can repeat, in which case the last written entry is retrieved first. |
tag_size | length of tag |
long mvl_add_list_entry | ( | LIBMVL_NAMED_LIST * | L, |
long | tag_length, | ||
const char * | tag, | ||
LIBMVL_OFFSET64 | offset | ||
) |
Add entry to LIBMVL_NAMED_LIST. The entry is always appended to the end.
L | pointer to previously allocated LIBMVL_NAMED_LIST |
tag_length | size of tag |
tag | string identifying entry - these can repeat. |
offset | 64-bit value |
HASH_MAP* mvl_allocate_hash_map | ( | LIBMVL_OFFSET64 | max_index_count | ) |
Create HASH_MAP structure.
This creates default HASH_MAP structure with all members allocated with new arrays. In some situations, such as to save memory, it is possible to reuse existing arrays by specifying hm->flags appropriately. In such a case, one should not use this constructor and instead create the structure manually.
max_index_count | expected number of entries to hash |
|
inlinestatic |
Return idx vector entry as a double.
This function is meant as a convenience function for retrieving a few values, such as stored configuration parameters.
vec | a pointer to LIBMVL_VECTOR |
idx | index into a vector |
|
inlinestatic |
Return idx vector entry as a double, with default for missing values.
This function is meant as a convenience function for retrieving a few values, such as stored configuration parameters.
vec | a pointer to LIBMVL_VECTOR |
idx | index into a vector |
def | default value to return in case of out of bounds indices. |
|
inlinestatic |
Return idx vector entry as an offset.
This function is meant as a convenience function for retrieving a few values, such as stored configuration parameters. Only LIBMVL_VECTOR_OFFSET64 vectors are supported
vec | a pointer to LIBMVL_VECTOR |
idx | index into a vector |
|
inlinestatic |
Clear error code.
ctx | pointer to context previously allocated with mvl_create_context() |
void mvl_close | ( | LIBMVL_CONTEXT * | ctx | ) |
int mvl_compute_extent_index | ( | LIBMVL_EXTENT_INDEX * | ei, |
LIBMVL_OFFSET64 | count, | ||
LIBMVL_VECTOR ** | vec, | ||
void ** | data, | ||
LIBMVL_OFFSET64 * | data_length | ||
) |
Compute an extent index.
ei | a pointer to extent index structure |
count | the number of LIBMVL_VECTORS considered as columns in a table |
vec | an array of pointers to LIBMVL_VECTORS considered as columns in a table |
data | an array of pointers to memory mapped areas those LIBMVL_VECTORs derive from. This allows computing hash from vectors drawn from different MVL files |
void mvl_compute_hash_map | ( | HASH_MAP * | hm | ) |
Compute hash map. This assumes that hm->hash array has been populated with hm->hash_count hashes computed with mvl_hash_indices().
hm | a pointer to HASH_MAP structure |
LIBMVL_OFFSET64 mvl_compute_hash_map_size | ( | LIBMVL_OFFSET64 | hash_count | ) |
void mvl_compute_vec_stats | ( | const LIBMVL_VECTOR * | vec, |
LIBMVL_VEC_STATS * | stats | ||
) |
Compute vector statistics, such as a bounding box.
vec | a pointer to LIBMVL_VECTOR |
stats | a pointer to previously allocated LIBMVL_VEC_STATS structure |
LIBMVL_CONTEXT* mvl_create_context | ( | void | ) |
Create MVL context.
LIBMVL_NAMED_LIST* mvl_create_named_list | ( | int | size | ) |
Allocate and initialize structure for LIBMVL_NAMED_LIST.
size | this can be set to large values if the final size of named list is known |
LIBMVL_NAMED_LIST* mvl_create_R_attributes_list | ( | LIBMVL_CONTEXT * | ctx, |
const char * | R_class | ||
) |
Create R-style attribute list for class given by R_class, which could be, for example, "data.frame".
ctx | MVL context pointer that has been initialized for writing |
R_class | string identifying R class, such as "data.frame" |
|
inlinestatic |
|
inlinestatic |
void mvl_extend_extent_list | ( | LIBMVL_EXTENT_LIST * | el, |
LIBMVL_OFFSET64 | nelem | ||
) |
void mvl_extend_partition | ( | LIBMVL_PARTITION * | el, |
LIBMVL_OFFSET64 | nelem | ||
) |
LIBMVL_OFFSET64 mvl_find_directory_entry | ( | LIBMVL_CONTEXT * | ctx, |
const char * | tag | ||
) |
void mvl_find_groups | ( | LIBMVL_OFFSET64 | indices_count, |
const LIBMVL_OFFSET64 * | indices, | ||
LIBMVL_OFFSET64 | vec_count, | ||
LIBMVL_VECTOR ** | vec, | ||
void ** | vec_data, | ||
LIBMVL_OFFSET64 * | vec_data_length, | ||
HASH_MAP * | hm | ||
) |
This function transforms HASH_MAP into a list of groups. Similar to GROUP BY clause in SQL.
The original HASH_MAP describes groups of rows with identical hashes. However, there is a (remote) possibility of collision where different rows have the same hash. This function resolves this ambiguity. After this call hm->hash_map becomes invalid, but hm->first and hm->next describe exactly identical rows.
indices_count | number of elements in indices array |
indices | an array of indices used to create HASH_MAP hm |
vec_count | the number of LIBMVL_VECTORS considered as columns in a table |
vec | an array of pointers to LIBMVL_VECTORS considered as columns in a table |
vec_data | an array of pointers to memory mapped areas those LIBMVL_VECTORs derive from. This allows computing hash from vectors drawn from different MVL files |
vec_data_length | an array of lengths of memory mapped areas those LIBMVL_VECTORs derive from. |
hm | a previously computed (with mvl_compute_hash_map()) HASH_MAP |
LIBMVL_OFFSET64 mvl_find_list_entry | ( | LIBMVL_NAMED_LIST * | L, |
long | tag_length, | ||
const char * | tag | ||
) |
Find existing entry inside LIBMVL_NAMED_LIST. If several identically named entries exist this function returns last written value. Hash table is used if present.
L | pointer to previously allocated LIBMVL_NAMED_LIST |
tag_length | size of tag |
tag | string identifying entry - these can repeat. |
int mvl_find_matches | ( | LIBMVL_OFFSET64 | key_indices_count, |
const LIBMVL_OFFSET64 * | key_indices, | ||
LIBMVL_OFFSET64 | key_vec_count, | ||
LIBMVL_VECTOR ** | key_vec, | ||
void ** | key_vec_data, | ||
LIBMVL_OFFSET64 * | key_vec_data_length, | ||
LIBMVL_OFFSET64 * | key_hash, | ||
LIBMVL_OFFSET64 | indices_count, | ||
const LIBMVL_OFFSET64 * | indices, | ||
LIBMVL_OFFSET64 | vec_count, | ||
LIBMVL_VECTOR ** | vec, | ||
void ** | vec_data, | ||
LIBMVL_OFFSET64 * | vec_data_length, | ||
HASH_MAP * | hm, | ||
LIBMVL_OFFSET64 * | key_last, | ||
LIBMVL_OFFSET64 | pairs_size, | ||
LIBMVL_OFFSET64 * | key_match_indices, | ||
LIBMVL_OFFSET64 * | match_indices | ||
) |
Compute pairs of merge indices. This is similar to JOIN operation in SQL.
This function takes two table like sets of vectors as input. The vectors in each table set have to be of equal number of elements. We also take two index arrays specifying rows in each table set. We then find pairs of indices where the rows are identical.
The output is returned in a pair of preallocated arrays key_match_indices and match_indices. The pairs are arranged in stretches of identical "key" rows. Those stretches are described by the key_last array.
key_indices_count | number of entries in key_indices array |
key_indices | an array with indices into "key" table-like vector set |
key_vec_count | number of vectors in "key" table set |
key_vec | an array of vectors in "key" table set |
key_vec_data | an array of pointers to memory mapped areas those "key" vectors derive from. This allows computing hash from vectors drawn from different MVL files |
key_vec_data_length | an array of lengths of memory mapped areas those "key" vectors derive from. |
key_hash | an array of hashes of "key" vectors computed with mvl_hash_indices() |
indices_count | number of entries in indices array |
indices | an array with indices into "main" table-like vector set |
vec_count | number of vectors in "main" table set |
vec | an array of vectors in "main" table set |
vec_data | an array of pointers to memory mapped areas those "main" vectors derive from. This allows computing hash from vectors drawn from different MVL files |
vec_data_length | an array of lengths of memory mapped areas those "main" vectors derive from. |
hm | a previously computed HASH_MAP of "main" table set |
key_last | this is an output array of size key_indices_count that describes stretches of matches with identical "key" rows. Thus for "key" row i, the corresponding stretch is key_last[i-1] to key_last[i]-1 |
pairs_size | the size of allocated key_match_indices and match_indices arrays. This value can be computed with mvl_hash_match_count(). |
key_match_indices | an array of "key" indices from each pair |
match_indices | an array of "main" indices from each pair |
void mvl_find_repeats | ( | LIBMVL_PARTITION * | el, |
LIBMVL_OFFSET64 | count, | ||
LIBMVL_VECTOR ** | vec, | ||
void ** | data, | ||
LIBMVL_OFFSET64 * | data_length | ||
) |
Compute list of extents describing stretches of data with identical values.
el | pointer to previously allocated LIBMVL_PARTITION structure |
count | Number of vectors in vec |
vec | Array of vectors with identical number of elements |
data | Mapped data areas (needed to compare strings) |
data_length | Lengths of mapped data areas (needed to compare strings) |
void mvl_free_context | ( | LIBMVL_CONTEXT * | ctx | ) |
Release memory associated with MVL context.
ctx | pointer to context previously allocated with mvl_create_context() |
void mvl_free_extent_index_arrays | ( | LIBMVL_EXTENT_INDEX * | ei | ) |
Free arrays of previously allocated extent index. This function does not free the structure itself.
ei | a pointer to LIBMVL_EXTENT_INDEX structure |
void mvl_free_extent_list_arrays | ( | LIBMVL_EXTENT_LIST * | el | ) |
Free arrays of previously allocated extent list. This function does not free the structure itself.
el | a pointer to LIBMVL_EXTENT_LIST structure |
void mvl_free_hash_map | ( | HASH_MAP * | hash_map | ) |
void mvl_free_named_list | ( | LIBMVL_NAMED_LIST * | L | ) |
Free structure for LIBMVL_NAMED_LIST.
L | pointer to previously allocated LIBMVL_NAMED_LIST |
void mvl_free_partition_arrays | ( | LIBMVL_PARTITION * | el | ) |
free arrays of previously allocated partition. This function does not free the structure itself.
el | a pointer to LIBMVL_PARTITION structure |
LIBMVL_OFFSET64 mvl_get_character_class_offset | ( | LIBMVL_CONTEXT * | ctx | ) |
Get offset to metadata describing R-style character class - an array of strings. This is convenient for writing columns of strings to be analyzed with R - just provide this offset as the metadata field of mvl_write_packed_list()
ctx | MVL context pointer that has been initialized for writing |
|
inlinestatic |
Obtain integer error code.
ctx | pointer to context previously allocated with mvl_create_context() |
|
inlinestatic |
int mvl_hash_indices | ( | LIBMVL_OFFSET64 | indices_count, |
const LIBMVL_OFFSET64 * | indices, | ||
LIBMVL_OFFSET64 * | hash, | ||
LIBMVL_OFFSET64 | vec_count, | ||
LIBMVL_VECTOR ** | vec, | ||
void ** | vec_data, | ||
LIBMVL_OFFSET64 * | vec_data_length, | ||
int | flags | ||
) |
This function is used to compute 64-bit hashes of vector values. The array hash[] is passed in and contains the result of the computation.
Integer indices are computed by value, so that 100 produces the same hash whether it is stored as INT32 or INT64.
Floats and doubles are trickier - we can guarantee that the hash of a float promoted to a double is the same as the hash of the original float, but not the reverse.
indices_count | total number of indices |
indices | an array of indices into provided vectors |
hash | a previously allocated array of length indices_count that the computed hashes will be written into |
vec_count | the number of LIBMVL_VECTORS considered as columns in a table |
vec | an array of pointers to LIBMVL_VECTORS considered as columns in a table |
vec_data | an array of pointers to memory mapped areas those LIBMVL_VECTORs derive from. This allows computing hash from vectors drawn from different MVL files |
vec_data_length | an array of lengths of memory mapped areas those LIBMVL_VECTORs derive from. |
flags | flags specifying whether to initialize or finalize hash |
LIBMVL_OFFSET64 mvl_hash_match_count | ( | LIBMVL_OFFSET64 | key_count, |
const LIBMVL_OFFSET64 * | key_hash, | ||
HASH_MAP * | hm | ||
) |
Find count of matches between hashes of two sets.
This function is useful to find the upper limit on the number of possible matches, so one can allocate arrays for the result or plan computation in some other way.
key_count | number of key hashes |
key_hash | an array of key hashes to query |
hm | a pointer to HASH_MAP structure |
int mvl_hash_range | ( | LIBMVL_OFFSET64 | i0, |
LIBMVL_OFFSET64 | i1, | ||
LIBMVL_OFFSET64 * | hash, | ||
LIBMVL_OFFSET64 | vec_count, | ||
LIBMVL_VECTOR ** | vec, | ||
void ** | vec_data, | ||
LIBMVL_OFFSET64 * | vec_data_length, | ||
int | flags | ||
) |
This function is used to compute 64-bit hashes of vector values. The array hash[] is passed in and contains the result of the computation.
Integer indices are computed by value, so that 100 produces the same hash whether it is stored as INT32 or INT64.
Floats and doubles are trickier - we can guarantee that the hash of a float promoted to a double is the same as the hash of the original float, but not the reverse.
i0 | starting index to hash |
i1 | first index to not hash |
hash | a previously allocated array of length (i1-i0) that the computed hashes will be written into |
vec_count | the number of LIBMVL_VECTORS considered as columns in a table |
vec | an array of pointers to LIBMVL_VECTORS considered as columns in a table |
vec_data | an array of pointers to memory mapped areas those LIBMVL_VECTORs derive from. This allows computing hash from vectors drawn from different MVL files |
vec_data_length | an array of lengths of memory mapped areas those LIBMVL_VECTORs derive from. |
flags | flags specifying whether to initialize or finalize hash |
LIBMVL_OFFSET64 mvl_indexed_copy_vector | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_OFFSET64 | index_count, | ||
const LIBMVL_OFFSET64 * | indices, | ||
const LIBMVL_VECTOR * | vec, | ||
const void * | data, | ||
LIBMVL_OFFSET64 | data_length, | ||
LIBMVL_OFFSET64 | metadata, | ||
LIBMVL_OFFSET64 | max_buffer | ||
) |
Write MVL vector that contains data at specific indices. The indices can repeat, and can themselves be stored in memory mapped MVL file.
ctx | MVL context pointer that has been initialized for writing |
index_count | number of indices to process, this will determine the length of the new vector |
indices | array of indices into vector vec |
vec | a pointer to fully formed MVL vector, such as from mapped MVL file |
data | pointer to data of previously mapped MVL library |
data_length | length of data of previously mapped MVL library |
metadata | an optional offset to previously written metadata. Specify LIBMVL_NO_METADATA if not needed |
max_buffer | maximum size of buffer to hold in-flight data. Recommend to set to at least 10MB for efficiency. |
void mvl_init_extent_index | ( | LIBMVL_EXTENT_INDEX * | ei | ) |
Initialize freshly allocated extent index structure.
ei | a pointer to LIBMVL_EXTENT_INDEX structure |
void mvl_init_extent_list | ( | LIBMVL_EXTENT_LIST * | el | ) |
Initialize freshly allocated extent list structure.
el | a pointer to LIBMVL_EXTENT_LIST structure |
void mvl_load_image | ( | LIBMVL_CONTEXT * | ctx, |
const void * | data, | ||
LIBMVL_OFFSET64 | length | ||
) |
|
inlinestatic |
Find an entry in a named list and return its idx value as a double.
This function is meant as a convenience function for retrieving a few values stored in a named list, such as stored configuration parameters. It effectively performs double indexing L[tag][idx]
L | a pointer to previously retrieved LIBMVL_NAMED_LIST |
data | a pointer to beginning of memory mapped MVL file |
tag_length | length of character tag, or -1 to compute automatically |
tag | character tag |
idx | index into the entry |
|
inlinestatic |
Find an entry in a named list and return its idx value as a double, with a default value returned in case of errors.
This function is meant as a convenience function for retrieving a few values stored in a named list, such as stored configuration parameters. It effectively performs double indexing L[tag][idx]
L | a pointer to previously retrieved LIBMVL_NAMED_LIST |
data | a pointer to beginning of memory mapped MVL file |
tag_length | length of character tag, or -1 to compute automatically |
tag | character tag |
idx | index into the entry |
def | default value to return in case of errors |
|
inlinestatic |
Find an entry in a named list and return its idx value as an offset.
This function is meant as a convenience function for retrieving a few values stored in a named list, such as stored configuration parameters. It effectively performs double indexing L[tag][idx]
L | a pointer to previously retrieved LIBMVL_NAMED_LIST |
data | a pointer to beginning of memory mapped MVL file |
tag_length | length of character tag, or -1 to compute automatically |
tag | character tag |
idx | index into the entry |
void mvl_normalize_vector | ( | const LIBMVL_VECTOR * | vec, |
const LIBMVL_VEC_STATS * | stats, | ||
LIBMVL_OFFSET64 | i0, | ||
LIBMVL_OFFSET64 | i1, | ||
double * | out | ||
) |
normalize vector
This function converts numeric vectors into normalized double precision entries. Indices i0 and i1 specify the stretch of indices to normalize. This facilitates processing of very long vectors in pieces.
vec | a pointer to LIBMVL_VECTOR |
stats | previously allocated LIBMVL_VEC_STATS structure |
i0 | start index of stretch to process |
i1 | stop index of stretch to process |
out | array of normalized entries of size i1-i0. First entry corresponds to index i0 |
void mvl_open | ( | LIBMVL_CONTEXT * | ctx, |
FILE * | f | ||
) |
|
inlinestatic |
Get pointer to the start of string element idx from a packed list.
vec | a pointer to LIBMVL_VECTOR with type LIBMVL_PACKED_LIST64 |
data | a pointer to beginning of memory mapped MVL file |
idx | entry index |
|
inlinestatic |
Get length in bytes of string element idx from a packed list.
vec | a pointer to LIBMVL_VECTOR with type LIBMVL_PACKED_LIST64 |
idx | entry index |
|
inlinestatic |
Check whether packed list entry is a special string that indicates a missing value.
vec | a pointer to LIBMVL_VECTOR with type LIBMVL_PACKED_LIST64 |
data | a pointer to beginning of memory mapped MVL file |
idx | entry index |
|
inlinestatic |
Get pointer to the start of string element idx from a packed list.
vec | a pointer to LIBMVL_VECTOR with type LIBMVL_PACKED_LIST64 |
data | a pointer to beginning of memory mapped MVL file |
idx | entry index |
|
inlinestatic |
|
inlinestatic |
LIBMVL_NAMED_LIST* mvl_read_attributes_list | ( | LIBMVL_CONTEXT * | ctx, |
const void * | data, | ||
LIBMVL_OFFSET64 | data_size, | ||
LIBMVL_OFFSET64 | metadata_offset | ||
) |
Read back MVL attributes list, typically used to describe metadata. This function also initializes the hash table for fast access. This function does not check that the offsets stored in the returned LIBMVL_NAMED_LIST data structure are valid; this should be done by the code that uses those offsets.
ctx | MVL context pointer |
data | memory mapped data. If data is NULL then this function will use base address from context initialized by mvl_load_image() |
data_size | size of memory mapped data. If data is NULL then this function will use data_size from context initialized by mvl_load_image() |
metadata_offset | metadata offset pointing to the previously written attributes |
LIBMVL_NAMED_LIST* mvl_read_named_list | ( | LIBMVL_CONTEXT * | ctx, |
const void * | data, | ||
LIBMVL_OFFSET64 | data_size, | ||
LIBMVL_OFFSET64 | offset | ||
) |
Read back MVL named list. This function also initializes the hash table for fast access.
ctx | MVL context pointer |
data | memory mapped data. If data is NULL then this function will use base address from context initialized by mvl_load_image() |
data_size | size of memory mapped data. If data is NULL then this function will use data_size from context initialized by mvl_load_image() |
offset | offset into data where LIBMVL_NAMED_LIST begins |
void mvl_recompute_named_list_hash | ( | LIBMVL_NAMED_LIST * | L | ) |
Recompute named list hash.
L | pointer to previously allocated LIBMVL_NAMED_LIST |
void mvl_rewrite_vector | ( | LIBMVL_CONTEXT * | ctx, |
int | type, | ||
LIBMVL_OFFSET64 | base_offset, | ||
LIBMVL_OFFSET64 | idx, | ||
long | length, | ||
const void * | data | ||
) |
Write more data to MVL vector that has been previously created with mvl_start_write_vector()
ctx | MVL context pointer that has been initialized for writing |
type | MVL data type |
base_offset | the offset returned by mvl_start_write_vector() |
idx | index of first element pointed to by data |
length | number of elements to write |
data | pointer to data |
int mvl_sort_indices | ( | LIBMVL_OFFSET64 | indices_count, |
LIBMVL_OFFSET64 * | indices, | ||
LIBMVL_OFFSET64 | vec_count, | ||
LIBMVL_VECTOR ** | vec, | ||
void ** | vec_data, | ||
int | sort_function | ||
) |
Given a table-like set of vectors of equal length arrange indices so that the columns are sorted lexicographically.
indices_count | total number of indices |
indices | an array of indices into provided vectors |
vec_count | the number of LIBMVL_VECTORS considered as columns in a table |
vec | an array of pointers to LIBMVL_VECTORS considered as columns in a table |
vec_data | an array of pointers to memory mapped areas those LIBMVL_VECTORs derive from. This allows computing hash from vectors drawn from different MVL files |
sort_function | one of LIBMVL_SORT_LEXICOGRAPHIC or LIBMVL_SORT_LEXICOGRAPHIC_DESC to specify sort direction |
Definition at line 354 of file libMVL_sort.cc.
LIBMVL_OFFSET64 mvl_start_write_vector | ( | LIBMVL_CONTEXT * | ctx, |
int | type, | ||
LIBMVL_OFFSET64 | expected_length, | ||
LIBMVL_OFFSET64 | length, | ||
const void * | data, | ||
LIBMVL_OFFSET64 | metadata | ||
) |
Begin write of MVL vector. This is only needed if the vector has to be written in parts, such as due to memory constraints.
ctx | MVL context pointer that has been initialized for writing |
type | MVL data type |
expected_length | number of elements in the fully written vector |
length | number of elements to write |
data | pointer to data |
metadata | an optional offset to previously written metadata. Specify LIBMVL_NO_METADATA if not needed |
const char* mvl_strerror | ( | LIBMVL_CONTEXT * | ctx | ) |
Obtain description of error code.
ctx | pointer to context previously allocated with mvl_create_context() |
|
inlinestatic |
This function returns 0 if the offset into data points to a valid vector, or a negative error code otherwise.
offset | an offset into memory mapped data where the LIBMVL_VECTOR is located |
data | pointer to beginning of memory mapped data |
data_size | an upper limit for valid offsets - usually the size of mapped MVL file. if data_size is set to ~0LLU the checks are bypassed |
|
inlinestatic |
A convenience version of mvl_validate_vector() that uses data and data_size from MVL context. This function returns 0 if the offset into data points to a valid vector, or a negative error code otherwise.
ctx | MVL context pointer |
offset | an offset into memory mapped data where the LIBMVL_VECTOR is located |
|
inlinestatic |
A convenience function to convert an offset into memory mapped data into a pointer to LIBMVL_VECTOR structure.
This function validates vector structure, but not the contents of the vector.
data | pointer to memory mapped MVL file |
offset | 64-bit offset into MVL file |
|
inlinestatic |
A convenience function to convert an offset into memory mapped data into a pointer to LIBMVL_VECTOR structure.
It assumes that the offset is valid, to validate it see mvl_validate_vector()
data | pointer to memory mapped MVL file |
offset | 64-bit offset into MVL file |
|
inlinestatic |
Return number of entries in the vector. Currently this is the same as mvl_vector_length() for all types except LIBMVL_PACKED_LIST64.
vec | pointer to start of the vector |
int mvl_verify_checksum_vector | ( | LIBMVL_CONTEXT * | ctx, |
const LIBMVL_VECTOR * | checksum_vector, | ||
void * | data, | ||
LIBMVL_OFFSET64 | data_size, | ||
LIBMVL_OFFSET64 | start, | ||
LIBMVL_OFFSET64 | stop | ||
) |
Compute and verify checksums for a given area.
ctx | MVL context pointer that has been initialized for reading |
checksum_vector | pointer to checksum vector. You can pass NULL to use full checksums. |
data | base address. Usually the base address of memory mapped MVL file. Must be properly aligned. A possible reason for alignment errors is mmap() returning MAP_FAILED. If data is NULL then this function will use base address from context initialized by mvl_load_image() |
data_size | size of data. If data is NULL then this function will use data_size from context initialized by mvl_load_image() |
start | byte offset of start of area to checksum. Must be greater than or equal to the checksum_area_start field of checksum_vector |
stop | byte offset of first entry past the end of checksummed area. Must be greater than or equal to start. Must be less than or equal to the checksum_area_stop field of checksum_vector |
int mvl_verify_checksum_vector2 | ( | LIBMVL_CONTEXT * | ctx, |
const LIBMVL_VECTOR * | checksum_vector, | ||
void * | data, | ||
LIBMVL_OFFSET64 | data_size, | ||
LIBMVL_OFFSET64 | vector_offset | ||
) |
Compute and verify checksums for the entire area occupied by given LIBMVL_VECTOR. Metadata is not checked.
ctx | MVL context pointer that has been initialized for reading |
checksum_vector | pointer to checksum vector. You can pass NULL to use full checksums. |
data | base address. Usually the base address of memory mapped MVL file. Must be properly aligned. A possible reason for alignment errors is mmap() returning MAP_FAILED. If data is NULL then this function will use base address from context initialized by mvl_load_image() |
data_size | size of data. If data is NULL then this function will use data_size from context initialized by mvl_load_image() |
vector_offset | offset from base pointing to valid LIBMVL_VECTOR |
int mvl_verify_checksum_vector3 | ( | LIBMVL_CONTEXT * | ctx, |
const LIBMVL_VECTOR * | checksum_vector, | ||
void * | data, | ||
LIBMVL_OFFSET64 | data_size, | ||
void * | start, | ||
void * | stop | ||
) |
Compute and verify checksums for a given area. It works just like mvl_verify_checksum_vector() but takes pointers instead of offsets.
ctx | MVL context pointer that has been initialized for reading |
checksum_vector | pointer to checksum vector. You can pass NULL to use full checksums. |
data | base address. Usually the base address of memory mapped MVL file. Must be properly aligned. A possible reason for alignment errors is mmap() returning MAP_FAILED. If data is NULL then this function will use base address from context initialized by mvl_load_image() |
data_size | size of data. If data is NULL then this function will use data_size from context initialized by mvl_load_image() |
start | pointer to start of area to checksum. |
stop | pointer to first entry past the end of checksummed area. |
int mvl_verify_full_checksum_vector | ( | LIBMVL_CONTEXT * | ctx, |
const LIBMVL_VECTOR * | checksum_vector, | ||
void * | data, | ||
LIBMVL_OFFSET64 | data_size | ||
) |
Compute and verify checksums for the entire area covered by checksum vector.
ctx | MVL context pointer that has been initialized for reading |
checksum_vector | pointer to checksum vector. You can pass NULL to use full checksums. |
data | base address. Usually the base address of memory mapped MVL file. Must be properly aligned. A possible reason for alignment errors is mmap() returning MAP_FAILED. If data is NULL then this function will use base address from context initialized by mvl_load_image() |
data_size | size of data. If data is NULL then this function will use data_size from context initialized by mvl_load_image() |
LIBMVL_OFFSET64 mvl_write_attributes_list | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_NAMED_LIST * | L | ||
) |
LIBMVL_OFFSET64 mvl_write_cached_string | ( | LIBMVL_CONTEXT * | ctx, |
long | length, | ||
const char * | data | ||
) |
Write a single C string if it has not been written before, otherwise return offset to previously written object. In particular, this is handy for providing metadata tags.
ctx | MVL context pointer that has been initialized for writing |
length | string length. Set to -1 to be computed automatically. |
data | string data |
LIBMVL_OFFSET64 mvl_write_concat_vectors | ( | LIBMVL_CONTEXT * | ctx, |
int | type, | ||
long | nvec, | ||
const long * | lengths, | ||
void ** | data, | ||
LIBMVL_OFFSET64 | metadata | ||
) |
Write complete MVL vector concatenating data from many vectors or arrays.
ctx | MVL context pointer that has been initialized for writing |
type | MVL data type |
nvec | number of arrays to concatenate |
lengths | array of lengths of individual vectors |
data | array of pointers to vector data |
metadata | an optional offset to previously written metadata. Specify LIBMVL_NO_METADATA if not needed |
LIBMVL_OFFSET64 mvl_write_directory | ( | LIBMVL_CONTEXT * | ctx | ) |
Write out MVL file directory with entries collected so far. If this is called multiple times only the latest written directory is retrieved when MVL file is opened. It is an error to write out an empty directory.
ctx | MVL context pointer that has been initialized for writing |
LIBMVL_OFFSET64 mvl_write_extent_index | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_EXTENT_INDEX * | ei | ||
) |
LIBMVL_OFFSET64 mvl_write_hash64_checksum_vector | ( | LIBMVL_CONTEXT * | ctx, |
void * | data, | ||
LIBMVL_OFFSET64 | checksum_area_start, | ||
LIBMVL_OFFSET64 | checksum_area_stop, | ||
LIBMVL_OFFSET64 | checksum_block_size | ||
) |
Compute and write checksums for a given area.
ctx | MVL context pointer that has been initialized for writing |
data | base address. Usually the base address of memory mapped MVL file. Must be properly aligned. A possible reason for alignment errors is mmap() returning MAP_FAILED. If data is NULL then this function will use base address from context initialized by mvl_load_image() |
checksum_area_start | byte offset of start of area to checksum. Set to 0 to checksum from beginning of MVL file. Must be multiple of 8. |
checksum_area_stop | byte offset of first entry past the end of checksummed area. Must be multiple of 8. |
checksum_block_size | byte size of checksum blocks. Must be multiple of 8. |
LIBMVL_OFFSET64 mvl_write_named_list | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_NAMED_LIST * | L | ||
) |
Write out named list. In R, this would be read back as list.
ctx | MVL context pointer that has been initialized for writing |
L | previously created named list |
LIBMVL_OFFSET64 mvl_write_named_list2 | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_NAMED_LIST * | L, | ||
char * | cl | ||
) |
Write out named list. In R, this would be read back as list with class attribute set to "cl".
ctx | MVL context pointer that has been initialized for writing |
L | previously created named list |
cl | character string describing list class |
LIBMVL_OFFSET64 mvl_write_named_list_as_data_frame | ( | LIBMVL_CONTEXT * | ctx, |
LIBMVL_NAMED_LIST * | L, | ||
int | nrows, | ||
LIBMVL_OFFSET64 | rownames | ||
) |
Write out named list in the style of R data frames. It is assumed that all entries of L are vectors with the same number of elements.
ctx | MVL context pointer that has been initialized for writing |
L | previously created named list |
nrows | number of elements in each entry of L. Note that packed lists should have length of nrows+1 |
rownames | names of individual rows. Set to 0 to omit. |
LIBMVL_OFFSET64 mvl_write_packed_list | ( | LIBMVL_CONTEXT * | ctx, |
long | count, | ||
const long * | str_size, | ||
unsigned char ** | str, | ||
LIBMVL_OFFSET64 | metadata | ||
) |
Write an array of strings as a packed list data type. This is convenient for storing a lot of different strings.
ctx | MVL context pointer that has been initialized for writing |
count | Number of strings to store |
str_size | array of lengths of individual strings. If this is NULL string lengths are computed automatically. In addition, if any string length is -1 it is also computed automatically. |
str | pointer to array of strings |
metadata | an optional offset to previously written metadata. Specify LIBMVL_NO_METADATA if not needed |
LIBMVL_OFFSET64 mvl_write_string | ( | LIBMVL_CONTEXT * | ctx, |
long | length, | ||
const char * | data, | ||
LIBMVL_OFFSET64 | metadata | ||
) |
Write a single C string. In particular, this is handy for providing metadata tags.
ctx | MVL context pointer that has been initialized for writing |
length | string length. Set to -1 to be computed automatically. |
data | string data |
metadata | an optional offset to previously written metadata. Specify LIBMVL_NO_METADATA if not needed |
LIBMVL_OFFSET64 mvl_write_vector | ( | LIBMVL_CONTEXT * | ctx, |
int | type, | ||
LIBMVL_OFFSET64 | length, | ||
const void * | data, | ||
LIBMVL_OFFSET64 | metadata | ||
) |
Write complete MVL vector.
ctx | MVL context pointer that has been initialized for writing |
type | MVL data type |
length | number of elements to write |
data | pointer to data |
metadata | an optional offset to previously written metadata. Specify LIBMVL_NO_METADATA if not needed |