Skip to content

Commit f1593cc

Browse files
committed
Move C code for json+struct codec to an example in the docs
1 parent afaf3b9 commit f1593cc

File tree

4 files changed

+284
-1
lines changed

4 files changed

+284
-1
lines changed

c/examples/json_struct_metadata.c

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <err.h>
4+
#include <string.h>
5+
#include <tskit.h>
6+
7+
// These are properties of the ``json+struct`` codec, documented in tskit.
//
// The header is 21 bytes long: 4 magic bytes (offset 0), a 1-byte version
// number (offset 4), the JSON length (offset 5) and the binary length
// (offset 13), the two lengths each being stored as a little-endian uint64_t.
#define JSON_STRUCT_HEADER_SIZE 21

// magic bytes that must appear at the very start of the encoded buffer
const uint8_t json_struct_codec_magic[4] = { 'J', 'B', 'L', 'B' };
// the only codec version this example reads and writes
const uint8_t json_struct_codec_version = 1;
12+
13+
// Read a uint64_t stored in little-endian byte order (least-significant byte
// first) from the 8 bytes starting at `p`.
//
// The value is assembled one byte at a time, so the result does not depend on
// the byte order of the host or on the alignment of `p`.
static uint64_t
load_u64_le(const uint8_t *p)
{
    uint64_t value = 0;

    for (int i = 0; i < 8; i++)
        value |= (uint64_t) p[i] << (8 * i);
    return value;
}
27+
28+
// Write `value` in little-endian byte order (least-significant byte first)
// into the 8 bytes starting at `dest`.
//
// The bytes are stored one at a time, so the output does not depend on the
// byte order of the host or on the alignment of `dest`.
static void
set_u64_le(uint8_t *dest, uint64_t value)
{
    for (int i = 0; i < 8; i++)
        dest[i] = (uint8_t) ((value >> (8 * i)) & 0xFF);
}
41+
42+
// Extract the json and binary payloads from the `json+struct` codec data buffer.
43+
// Note that the output pointers `json` and `binary` reference memory
44+
// inside the `metadata` buffer passed in.
45+
void
46+
json_struct_codec_get_components(uint8_t *metadata, tsk_size_t metadata_length,
47+
uint8_t **json, tsk_size_t *json_length, uint8_t **binary, tsk_size_t *binary_length)
48+
{
49+
// check the structure of the codec header and the sizes it specifies
50+
if (metadata == NULL || json == NULL || json_length == NULL || binary == NULL
51+
|| binary_length == NULL)
52+
errx(EXIT_FAILURE, "bad parameter value.");
53+
if (metadata_length < JSON_STRUCT_HEADER_SIZE)
54+
errx(EXIT_FAILURE, "metadata truncated.");
55+
if (memcmp(metadata, json_struct_codec_magic, sizeof(json_struct_codec_magic)) != 0)
56+
errx(EXIT_FAILURE, "bad magic bytes.");
57+
58+
uint8_t version = metadata[4];
59+
if (version != json_struct_codec_version)
60+
errx(EXIT_FAILURE, "bad version number.");
61+
62+
uint64_t json_length_u64 = load_u64_le(metadata + 5);
63+
uint64_t binary_length_u64 = load_u64_le(metadata + 13);
64+
if (json_length_u64 > UINT64_MAX - (uint64_t) JSON_STRUCT_HEADER_SIZE)
65+
errx(EXIT_FAILURE, "invalid length.");
66+
67+
// determine the number of padding bytes and do more safety checks
68+
uint64_t length = (uint64_t) JSON_STRUCT_HEADER_SIZE + json_length_u64;
69+
uint64_t padding_length = (8 - (length & 0x07)) % 8;
70+
length += padding_length;
71+
if (binary_length_u64 > UINT64_MAX - length)
72+
errx(EXIT_FAILURE, "invalid length.");
73+
74+
length += binary_length_u64;
75+
if ((uint64_t) metadata_length != length)
76+
errx(EXIT_FAILURE, "unexpected size.");
77+
78+
uint8_t *padding_start = metadata + JSON_STRUCT_HEADER_SIZE + json_length_u64;
79+
for (uint64_t j = 0; j < padding_length; ++j)
80+
if (*(padding_start + j) != 0)
81+
errx(EXIT_FAILURE, "padding bytes are nonzero.");
82+
83+
// the structure of the codec data seems valid; return components
84+
*json = metadata + JSON_STRUCT_HEADER_SIZE;
85+
*json_length = (tsk_size_t) json_length_u64;
86+
87+
*binary = metadata + JSON_STRUCT_HEADER_SIZE + json_length_u64 + padding_length;
88+
*binary_length = (tsk_size_t) binary_length_u64;
89+
}
90+
91+
// malloc and return a data buffer for the `json+struct` codec
92+
// that contains the given components
93+
void
94+
json_struct_codec_create_buffer(const uint8_t *json, tsk_size_t json_length,
95+
const uint8_t *binary, tsk_size_t binary_length, uint8_t **buffer,
96+
tsk_size_t *buffer_length)
97+
{
98+
// figure out the total length of the codec's data and allocate the buffer for it
99+
tsk_size_t header_length = JSON_STRUCT_HEADER_SIZE;
100+
tsk_size_t padding_length = (8 - ((header_length + json_length) & 0x07)) % 8;
101+
tsk_size_t total_length
102+
= header_length + json_length + padding_length + binary_length;
103+
uint8_t *bytes = malloc(total_length);
104+
if (!bytes)
105+
errx(EXIT_FAILURE, "memory for buffer could not be allocated.");
106+
107+
// then set up the bytes for the codec header
108+
memcpy(bytes, json_struct_codec_magic, 4);
109+
bytes[4] = json_struct_codec_version;
110+
set_u64_le(bytes + 5, (uint64_t) json_length);
111+
set_u64_le(bytes + 13, (uint64_t) binary_length);
112+
113+
// copy in the JSON and binary data, separated by the padding bytes; the goal of the
114+
// padding bytes is to ensure that the binary data is 8-byte-aligned relative to the
115+
// start of the buffer
116+
memcpy(bytes + header_length, json, json_length);
117+
memset(bytes + header_length + json_length, 0, padding_length);
118+
memcpy(bytes + header_length + json_length + padding_length, binary, binary_length);
119+
120+
// return the buffer and its length; the caller takes ownership of the buffer
121+
*buffer = bytes;
122+
*buffer_length = total_length;
123+
}
124+
125+
int
126+
main(int argc, char **argv)
127+
{
128+
// we start with JSON and binary payloads that we encode into a new buffer
129+
// note that the JSON payload does not have to end with a trailing NULL
130+
const char json_payload[] = { '{', '"', 'a', '"', ':', '1', '}' };
131+
const uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 };
132+
uint8_t *metadata;
133+
tsk_size_t metadata_length;
134+
135+
json_struct_codec_create_buffer((const uint8_t *) json_payload, sizeof(json_payload),
136+
binary_payload, sizeof(binary_payload), &metadata, &metadata_length);
137+
138+
// then we decode that buffer to recover the json and binary data
139+
uint8_t *decoded_json, *decoded_binary;
140+
tsk_size_t decoded_json_length, decoded_binary_length;
141+
142+
json_struct_codec_get_components(metadata, metadata_length, &decoded_json,
143+
&decoded_json_length, &decoded_binary, &decoded_binary_length);
144+
145+
// print the recovered data to demonstrate that the round-trip worked
146+
// note that the JSON data is not NULL-terminated unless you put a NULL there!
147+
printf("JSON: %.*s\n", (int) decoded_json_length, decoded_json);
148+
149+
printf("Binary data:");
150+
for (tsk_size_t j = 0; j < decoded_binary_length; j++)
151+
printf(" %#04x", decoded_binary[j]);
152+
printf("\n");
153+
154+
free(metadata);
155+
return EXIT_SUCCESS;
156+
}

c/meson.build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ if not meson.is_subproject()
125125
executable('multichrom_wright_fisher_singlethreaded',
126126
sources: ['examples/multichrom_wright_fisher_singlethreaded.c'],
127127
link_with: [tskit_lib], dependencies: lib_deps)
128+
executable('json_struct_metadata',
129+
sources: ['examples/json_struct_metadata.c'],
130+
link_with: [tskit_lib], dependencies: lib_deps)
128131

129132
thread_dep = dependency('threads')
130133
executable('multichrom_wright_fisher',

docs/c-api.rst

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,3 +949,60 @@ nodes need to be retained, and use
949949
.. literalinclude:: ../c/examples/multichrom_wright_fisher.c
950950
:language: c
951951

952+
----------------------------
953+
Reading and writing metadata
954+
----------------------------
955+
956+
The C API does not provide any functionality for manipulating
957+
the contents of metadata. For JSON metadata it is easy to
958+
parse metadata using an external JSON library, and for
959+
struct-encoded metadata the values can be directly unpacked.
960+
Examples of both can be found in
961+
`the SLiM code <https://github.com/MesserLab/SLiM>`_.
962+
963+
The :ref:`"json+struct" <sec_metadata_codecs_jsonstruct>`
964+
metadata codec is a little less straightforward to use,
965+
so we provide here an example of how to write to it
966+
and read from it in C. See :ref:`sec_metadata_codecs_jsonstruct`
967+
for details of how the metadata is encoded.
968+
(In Python, tskit automatically decodes both JSON and binary
969+
metadata and provides it as Python-data-typed metadata,
970+
just as for other codecs.)
971+
972+
The structure of this example is as follows:
973+
974+
1. Values specific to the metadata's header (e.g., the magic bytes ``JBLB``).
975+
2. Functions that encode/decode ``uint64_t``, used to store the lengths
976+
of the two components in the header.
977+
3. A method to "read" the metadata: really, to get pointers to the
978+
json and struct components.
979+
4. A method to "write" the metadata, again just given pointers to
980+
and lengths of the two components.
981+
5. The program itself just round-trips a very simple chunk of metadata,
982+
consisting of the JSON ``{"a":1}`` and four binary ``uint8_t`` bytes (``0x01 0x02 0x03 0x04``).
983+
984+
.. literalinclude:: ../c/examples/json_struct_metadata.c
985+
:language: c
986+
987+
Much of the complexity of the code is careful error checking of the lengths.
988+
989+
Here ``json_struct_codec_get_components`` takes a pointer to binary metadata
990+
and returns pointers to *within that memory*.
991+
A different approach might have copied the two portions of the metadata
992+
into two buffers (to then be decoded, for instance).
993+
However, that would double the memory footprint,
994+
and since this codec is intended for large metadata,
995+
we did not use that approach in this example.
996+
997+
Along the same lines, it is worth noting that this example does make a copy of
998+
the JSON and binary data when writing, in ``json_struct_codec_create_buffer()``,
999+
which doubles the memory footprint at that point, and adds the
1000+
overhead of copying the data. A more efficient approach would be to calculate
1001+
the buffer length needed for the codec’s data, allocate the buffer with that
1002+
length, and then generate the necessary JSON and binary metadata directly into
1003+
that buffer. This would require the metadata-generating code to be more
1004+
closely entwined with the code for handling the json+struct codec header and
1005+
padding bytes, and so we have chosen not to adopt that approach here, for
1006+
pedagogical purposes; but if your use of this codec will involve large
1007+
metadata, such an approach is recommended.
1008+

docs/metadata.md

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric
527527
types above. `L` is the default. As an example:
528528

529529
```
530-
{"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"}
530+
{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"}
531531
```
532532

533533
Will result in an array of 2 byte integers, prepended by a single-byte array-length.
@@ -555,6 +555,73 @@ As a special case under the `struct` codec, the top-level type of metadata can b
555555
union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should
556556
be defined as normal, and will be ignored if the metadata is `None`.
557557

558+
(sec_metadata_codecs_jsonstruct)=
559+
560+
### `json+struct`
561+
562+
An additional codec provides the ability to store *both* JSON and binary-encoded data.
563+
This is provided for the case where we want to store some arbitrary metadata
564+
(as JSON) along with a relatively large amount of data (as binary, for efficiency).
565+
For instance, we might want to record a raster map of the sampled area
566+
along with a few pieces of generic information (e.g., the name of the area).
567+
568+
The metadata schema for "json+struct" metadata basically just specifies both
569+
a JSON metadata schema and a struct metadata schema.
570+
Each entry in the metadata is encoded with either the JSON or the struct codec.
571+
Here is a simple example:
572+
573+
```{code-cell}
574+
schema = {
575+
"codec": "json+struct",
576+
"json": {
577+
"type": "object",
578+
"properties": {
579+
"label": {"type": "string"},
580+
"id": {"type": "number"},
581+
},
582+
"required": ["label"],
583+
},
584+
"struct": {
585+
"type": "object",
586+
"properties": {
587+
"values": {
588+
"type": "array",
589+
"arrayLengthFormat": "B",
590+
"items": {"type": "number", "binaryFormat": "i"},
591+
},
592+
},
593+
},
594+
}
595+
ms = tskit.MetadataSchema(schema)
596+
row = {"label": "alpha", "id": 7, "values": [5, 10, 2, 12]}
597+
encoded = ms.validate_and_encode_row(row)
598+
print("Encoded:", encoded)
599+
print("Decoded:", ms.decode_row(encoded))
600+
```
601+
602+
This encodes two things in JSON: a label and an ID number,
603+
and then an array of integers in binary (using the ``struct`` codec).
604+
If the array of integers is large, this could result in
605+
much better performance.
606+
607+
608+
#### Binary representation
609+
610+
The underlying structure of the JSON+struct codec is as follows.
(If you're not writing out data in this format,
you don't need to worry about this.)

1. some magic bytes (the four bytes `JBLB`);
2. a version number (one byte);
3. the length of the JSON in bytes (a little-endian 64-bit unsigned integer);
4. the length of the binary (struct) data in bytes
   (a little-endian 64-bit unsigned integer);
5. the JSON data;
6. zero-ed "padding" bytes to bring the start of the binary section
   into 8-byte alignment;
7. the binary data.

The structure of the binary data is specified using the "struct" portion
of the metadata schema.
623+
624+
558625
(sec_metadata_schema_examples)=
559626

560627
## Schema examples

0 commit comments

Comments
 (0)