Skip to content

Commit dc55f0f

Browse files
committed
gh-114953: Support for multithreaded XZ compression in lzma.py
Signed-off-by: Meir Komet <mskomet1@gmail.com>
1 parent 28bb296 commit dc55f0f

File tree

4 files changed

+97
-17
lines changed

4 files changed

+97
-17
lines changed

Lib/lzma.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ class LZMAFile(_compression.BaseStream):
4747
"""
4848

4949
def __init__(self, filename=None, mode="r", *,
50-
format=None, check=-1, preset=None, filters=None):
50+
format=None, check=-1, preset=None, filters=None,
51+
threads=1):
5152
"""Open an LZMA-compressed file in binary mode.
5253
5354
filename can be either an actual file name (given as a str,
@@ -72,7 +73,8 @@ def __init__(self, filename=None, mode="r", *,
7273
When opening a file for reading, the *preset* argument is not
7374
meaningful, and should be omitted. The *filters* argument should
7475
also be omitted, except when format is FORMAT_RAW (in which case
75-
it is required).
76+
it is required). The *threads* argument is only useful when writing
77+
using FORMAT_XZ.
7678
7779
When opening a file for writing, the settings used by the
7880
compressor can be specified either as a preset compression
@@ -89,6 +91,11 @@ def __init__(self, filename=None, mode="r", *,
8991
filters (if provided) should be a sequence of dicts. Each dict
9092
should have an entry for "id" indicating ID of the filter, plus
9193
additional entries for options to the filter.
94+
95+
threads (if provided) should be a non-negative integer indicating how
96+
many background threads to create for the compressor (only when using
97+
FORMAT_XZ, otherwise the compression will be single-threaded).
98+
If 0, the number of threads is set to the number of CPU cores.
9299
"""
93100
self._fp = None
94101
self._closefp = False
@@ -109,7 +116,8 @@ def __init__(self, filename=None, mode="r", *,
109116
format = FORMAT_XZ
110117
mode_code = _MODE_WRITE
111118
self._compressor = LZMACompressor(format=format, check=check,
112-
preset=preset, filters=filters)
119+
preset=preset, filters=filters,
120+
threads=threads)
113121
self._pos = 0
114122
else:
115123
raise ValueError("Invalid mode: {!r}".format(mode))
@@ -316,15 +324,15 @@ def open(filename, mode="rb", *,
316324
return binary_file
317325

318326

319-
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
327+
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None, threads=1):
320328
"""Compress a block of data.
321329
322330
Refer to LZMACompressor's docstring for a description of the
323331
optional arguments *format*, *check*, *preset* and *filters*.
324332
325333
For incremental compression, use an LZMACompressor instead.
326334
"""
327-
comp = LZMACompressor(format, check, preset, filters)
335+
comp = LZMACompressor(format, check, preset, filters, threads)
328336
return comp.compress(data) + comp.flush()
329337

330338

Lib/test/test_lzma.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,18 @@ def test_roundtrip_xz(self):
282282
lzd = LZMADecompressor()
283283
self._test_decompressor(lzd, cdata, lzma.CHECK_CRC64)
284284

285+
def test_roundtrip_xz_mt(self):
286+
lzc = LZMACompressor(threads=0)
287+
cdata = lzc.compress(INPUT) + lzc.flush()
288+
lzd = LZMADecompressor()
289+
self._test_decompressor(lzd, cdata, lzma.CHECK_CRC64)
290+
291+
def test_roundtrip_xz_mt_preset_6(self):
292+
lzc = LZMACompressor(preset=6, threads=8)
293+
cdata = lzc.compress(INPUT) + lzc.flush()
294+
lzd = LZMADecompressor()
295+
self._test_decompressor(lzd, cdata, lzma.CHECK_CRC64)
296+
285297
def test_roundtrip_alone(self):
286298
lzc = LZMACompressor(lzma.FORMAT_ALONE)
287299
cdata = lzc.compress(INPUT) + lzc.flush()
@@ -540,6 +552,8 @@ class FileTestCase(unittest.TestCase):
540552
def test_init(self):
541553
with LZMAFile(BytesIO(COMPRESSED_XZ)) as f:
542554
pass
555+
with LZMAFile(BytesIO(COMPRESSED_XZ), threads=4) as f:
556+
pass
543557
with LZMAFile(BytesIO(), "w") as f:
544558
pass
545559
with LZMAFile(BytesIO(), "x") as f:
@@ -657,6 +671,8 @@ def test_init_bad_filter_spec(self):
657671
LZMAFile(BytesIO(), "w", filters=[b"wobsite"])
658672
with self.assertRaises(ValueError):
659673
LZMAFile(BytesIO(), "w", filters=[{"xyzzy": 3}])
674+
with self.assertRaises(ValueError):
675+
LZMAFile(BytesIO(), "w", filters=[{"xyzzy": 3}], threads=4)
660676
with self.assertRaises(ValueError):
661677
LZMAFile(BytesIO(), "w", filters=[{"id": 98765}])
662678
with self.assertRaises(ValueError):
@@ -669,6 +685,10 @@ def test_init_bad_filter_spec(self):
669685
LZMAFile(BytesIO(), "w",
670686
filters=[{"id": lzma.FILTER_X86, "foo": 0}])
671687

688+
def test_init_bad_threads(self):
689+
with self.assertRaises(ValueError):
690+
LZMAFile(BytesIO(), "w", threads=-1)
691+
672692
def test_init_with_preset_and_filters(self):
673693
with self.assertRaises(ValueError):
674694
LZMAFile(BytesIO(), "w", format=lzma.FORMAT_RAW,
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Added support for multi-threaded XZ compression by passing in new optional
2+
parameter ``threads`` into :mod:`lzma` API. Supported both in the
3+
:func:`lzma.compress()` function and :class:`lzma.LZMAFile` class.
4+
5+
This is supported by using the underlying ``lzma_stream_encoder_mt()`` API
6+
provided by liblzma.

Modules/_lzmamodule.c

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -650,20 +650,60 @@ _lzma_LZMACompressor_flush_impl(Compressor *self)
650650

651651
static int
652652
Compressor_init_xz(_lzma_state *state, lzma_stream *lzs,
653-
int check, uint32_t preset, PyObject *filterspecs)
653+
int check, uint32_t preset, PyObject *filterspecs,
654+
int threads)
654655
{
655656
lzma_ret lzret;
656657

657-
if (filterspecs == Py_None) {
658-
lzret = lzma_easy_encoder(lzs, preset, check);
659-
} else {
660-
lzma_filter filters[LZMA_FILTERS_MAX + 1];
658+
if (threads == 1) {
659+
if (filterspecs == Py_None) {
660+
lzret = lzma_easy_encoder(lzs, preset, check);
661+
} else {
662+
lzma_filter filters[LZMA_FILTERS_MAX + 1];
661663

662-
if (parse_filter_chain_spec(state, filters, filterspecs) == -1)
664+
if (parse_filter_chain_spec(state, filters, filterspecs) == -1)
665+
return -1;
666+
lzret = lzma_stream_encoder(lzs, filters, check);
667+
free_filter_chain(filters);
668+
}
669+
} else {
670+
if (threads < 0) {
671+
PyErr_SetString(PyExc_ValueError,
672+
"threads must be a non-negative integer");
663673
return -1;
664-
lzret = lzma_stream_encoder(lzs, filters, check);
665-
free_filter_chain(filters);
674+
}
675+
676+
if (threads == 0) {
677+
threads = lzma_cputhreads();
678+
}
679+
if (threads == 0) {
680+
threads = 1;
681+
}
682+
683+
lzma_mt mt = {
684+
.flags = 0,
685+
.block_size = 0,
686+
.timeout = 0,
687+
.preset = preset,
688+
.check = check,
689+
.filters = NULL,
690+
.threads = threads,
691+
};
692+
693+
if (filterspecs == Py_None) {
694+
lzret = lzma_stream_encoder_mt(lzs, &mt);
695+
} else {
696+
lzma_filter filters[LZMA_FILTERS_MAX + 1];
697+
698+
if (parse_filter_chain_spec(state, filters, filterspecs) == -1)
699+
return -1;
700+
701+
mt.filters = filters;
702+
lzret = lzma_stream_encoder_mt(lzs, &mt);
703+
free_filter_chain(filters);
704+
}
666705
}
706+
667707
if (catch_lzma_error(state, lzret)) {
668708
return -1;
669709
}
@@ -755,6 +795,9 @@ _lzma.LZMACompressor.__new__
755795
have an entry for "id" indicating the ID of the filter, plus
756796
additional entries for options to the filter.
757797
798+
threads: int(c_default="1") = 1
799+
Number of threads to use for compression (only relevant with FORMAT_XZ).
800+
758801
Create a compressor object for compressing data incrementally.
759802
760803
The settings used by the compressor can be specified either as a
@@ -769,9 +812,11 @@ For one-shot compression, use the compress() function instead.
769812
static PyObject *
770813
Compressor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
771814
{
772-
static char *arg_names[] = {"format", "check", "preset", "filters", NULL};
815+
static char *arg_names[] = {"format", "check", "preset", "filters",
816+
"threads", NULL};
773817
int format = FORMAT_XZ;
774818
int check = -1;
819+
int threads = 1;
775820
uint32_t preset = LZMA_PRESET_DEFAULT;
776821
PyObject *preset_obj = Py_None;
777822
PyObject *filterspecs = Py_None;
@@ -780,9 +825,9 @@ Compressor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
780825
_lzma_state *state = PyType_GetModuleState(type);
781826
assert(state != NULL);
782827
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
783-
"|iiOO:LZMACompressor", arg_names,
828+
"|iiOOi:LZMACompressor", arg_names,
784829
&format, &check, &preset_obj,
785-
&filterspecs)) {
830+
&filterspecs, &threads)) {
786831
return NULL;
787832
}
788833

@@ -826,7 +871,8 @@ Compressor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
826871
if (check == -1) {
827872
check = LZMA_CHECK_CRC64;
828873
}
829-
if (Compressor_init_xz(state, &self->lzs, check, preset, filterspecs) != 0) {
874+
if (Compressor_init_xz(state, &self->lzs, check, preset,
875+
filterspecs, threads) != 0) {
830876
goto error;
831877
}
832878
break;

0 commit comments

Comments
 (0)