-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsmazmodule.c
127 lines (119 loc) · 5.22 KB
/
smazmodule.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#define PY_SSIZE_T_CLEAN
#include "./smaz/smaz.h"
#include <Python.h>
static PyObject *py_smaz_compress(PyObject *self, PyObject *args) {
char *uncompressed_input;
/*
* ":compress" leads to Python referencing this function correctly in the
* event of an error during PyArg_ParseTuple, like when passing a list instead
* of a string.
*/
if (!PyArg_ParseTuple(args, "s:compress", &uncompressed_input)) {
return NULL;
}
// 4KB is the default page size supported by many architectures. so lets
// allocating an entire memory page.
int output_buffer_size = 4096;
// output_buffer will hold our (raw) compressed string data.
char *output_buffer = (char *)malloc(output_buffer_size);
int compressed_size =
smaz_compress(uncompressed_input, strlen(uncompressed_input),
output_buffer, output_buffer_size);
/*
* If the compressed string didn't fit into the first 4096, we will get 4097
* returned. So, we continue increasing memory until the compressed output
* fits.
*/
while (output_buffer_size + 1 == compressed_size) {
/*
* increase memory usage.
* I dont feel like this is ideal as it might heavily over-allocate on
* bigger strings. But I do see that it is a good tradeoff between how many
* times we have to run this loop and memory usage.
*/
output_buffer_size *= 2;
output_buffer = (char *)realloc(output_buffer, output_buffer_size);
// try again. the while() on top then checks if it did fit this time around.
compressed_size =
smaz_compress(uncompressed_input, strlen(uncompressed_input),
output_buffer, output_buffer_size);
}
/*
* Yay. Compression done. Let's build python bytes out of that raw memory!
* No matter how big our buffer is, the "compressed_size" tells us where the
* actual data stops. That's where we mark the end.
*/
PyObject *py_bytes_object =
Py_BuildValue("y#", output_buffer, compressed_size);
free(output_buffer);
return py_bytes_object;
}
static PyObject *py_smaz_decompress(PyObject *self, PyObject *args) {
char *compressed_data = NULL;
Py_ssize_t compressed_data_size = 0;
/*
* ":decompress" leads to Python referencing this function correctly in the
* event of an error during PyArg_ParseTuple, like when passing a list instead
* of bytes.
*
* Note that we *have* to use "y#" because "y" does not allow for NULL bytes.
*/
if (!PyArg_ParseTuple(args, "y#:decompress", &compressed_data,
&compressed_data_size)) {
return NULL;
}
// 4KB is the default page size supported by many architectures. so lets
// allocating an entire memory page.
int output_buffer_size = 4096;
// output_buffer will hold our uncompressed string data.
char *output_buffer = (char *)malloc(output_buffer_size);
int decompressed_size = smaz_decompress(compressed_data, compressed_data_size,
output_buffer, output_buffer_size);
/* If the compressed string didn't fit into the first 4096, we will get 4097
* returned. So, we continue increasing memory until the compressed output
* fits.
*/
while (output_buffer_size + 1 == decompressed_size) {
/* increase memory usage.
* I dont feel like this is ideal as it might heavily over-allocate on
* bigger strings. But I do see that it is a good tradeoff between how many
* times we have to run this loop and memory usage.
*/
output_buffer_size *= 2;
output_buffer = (char *)malloc(output_buffer_size);
// try again. the while() on top then checks if it did fit this time around.
decompressed_size = smaz_decompress(compressed_data, compressed_data_size,
output_buffer, output_buffer_size);
}
/*
* Yay. Decompression done. Let's build a python string out of that raw
* memory! No matter how big our buffer is, the "decompressed_size" tells us
* where the actual data stops. That's where we mark the end for our string.
*/
output_buffer[decompressed_size] = 0;
PyObject *py_string_object = Py_BuildValue("s", output_buffer);
free(output_buffer);
return py_string_object;
}
// Which methods are exposed to the python world, including their docstrings.
static PyMethodDef SmazMethods[] = {
{"compress", py_smaz_compress, METH_VARARGS,
"Compresses a string using SMAZ compression.\n\nArgs:\n string: An "
"input string\nReturns:\n bytes: The input string compressed via SMAZ "
"compression."},
{"decompress", py_smaz_decompress, METH_VARARGS,
"Decompresses a SMAZ compressed string.\n\nArgs:\n bytes: A SMAZ "
"compressed input string\nReturns:\n string: The input string "
"decompressed via SMAZ decompression."},
{NULL, NULL, 0, NULL} /* Sentinel */
};
static struct PyModuleDef smazmodule = {
PyModuleDef_HEAD_INIT,
"smaz", // name of module
"String compression library using SMAZ", // module documentation, may be
// NULL
-1, /* size of per-interpreter state of the
module, or -1 if the module keeps
state in global variables. */
SmazMethods};
PyMODINIT_FUNC PyInit_smaz(void) { return PyModule_Create(&smazmodule); }