Commit 00e189e6 authored by Leo Pound Singer
Browse files

Add optional Zstandard compression

Zstandard provides much faster compression and decompression than
gzip at any compression ratio. The "zstandard" Python package is
an optional dependency to enable this new feature.

I tried running `ligolw_sqlite` on a recent gstlal coinc database
to convert it to XML. The uncompressed XML file took 32 seconds to
write and was 300M in size. The gzip-compressed XML file took 52
seconds and was 98M. The Zstandard-compressed XML file took 35
seconds and was 87M in size.

In short, Zstandard compression will give comparable file sizes
to gzip but with compression/decompression time that is negligible
compared to LIGO-LW processing itself.
parent 908f2a1f
Pipeline #117862 passed with stages
in 2 minutes and 3 seconds
......@@ -135,7 +135,8 @@ ligolw_utils.write_filename(
xmldoc,
options.output,
verbose = options.verbose,
gz = (options.output or "stdout").endswith(".gz")
gz = (options.output or "stdout").endswith(".gz"),
zst = (options.output or "stdout").endswith(".zst")
)
......
......@@ -193,5 +193,5 @@ for n, filename in enumerate(filenames, 1):
RemoveColumns(doc, options.delete_column)
if options.delete_element:
RemoveElements(doc, options.delete_element)
ligolw_utils.write_filename(doc, filename, verbose = options.verbose, gz = (filename or "stdout").endswith(".gz"))
ligolw_utils.write_filename(doc, filename, verbose = options.verbose, gz = (filename or "stdout").endswith(".gz"), zst = (filename or "stdout").endswith(".zst"))
doc.unlink()
......@@ -87,5 +87,5 @@ for filename in filenames or [None]:
ilwd.strip_ilwdchar(xmldoc)
ligolw_utils.write_filename(xmldoc, filename, gz = (filename or "stdout").endswith(".gz"), verbose = options.verbose)
ligolw_utils.write_filename(xmldoc, filename, gz = (filename or "stdout").endswith(".gz"), zst = (filename or "stdout").endswith(".zst"), verbose = options.verbose)
xmldoc.unlink()
......@@ -218,8 +218,8 @@ for url in urls:
if not options.output:
ligolw_utils.write_url(xmldoc, url, verbose = options.verbose, gz = (url or "stdout").endswith(".gz"))
ligolw_utils.write_url(xmldoc, url, verbose = options.verbose, gz = (url or "stdout").endswith(".gz"), zst = (url or "stdout").endswith(".zst"))
elif options.output == "-":
ligolw_utils.write_filename(xmldoc, None, verbose = options.verbose)
else:
ligolw_utils.write_filename(xmldoc, options.output, verbose = options.verbose, gz = options.output.endswith(".gz"))
ligolw_utils.write_filename(xmldoc, options.output, verbose = options.verbose, gz = options.output.endswith(".gz"), zst = options.output.endswith(".zst"))
......@@ -39,6 +39,18 @@ import signal
import stat
import sys
# Message raised when Zstandard support is requested but no usable module
# could be imported.
NO_ZSTD_ERROR_MESSAGE = '''\
For zstd compression, you must install the "zstandard" Python package:
pip install zstandard'''

# Zstandard support is optional.  The package named in the message above
# ("zstandard") is imported under the module name "zstandard"; bind it to
# the name "zstd" that the rest of this module uses.
try:
    import zstandard as zstd
except ImportError:
    try:
        # Legacy module name, kept so that environments where the original
        # "import zstd" worked keep working.
        # NOTE(review): older python-zstandard releases reportedly shipped
        # this alias -- confirm before relying on it.
        import zstd
    except ImportError:
        class NoZstd(object):
            """
            Placeholder bound to the name "zstd" when no Zstandard
            module is available.  Importing this module still succeeds;
            the ImportError is deferred until some attribute of the
            placeholder is actually accessed, i.e. until Zstandard
            (de)compression is actually requested.
            """
            def __getattr__(self, name):
                raise ImportError(NO_ZSTD_ERROR_MESSAGE)

        zstd = NoZstd()
from .. import __author__, __date__, __version__
from .. import ligolw
......@@ -309,7 +321,7 @@ class SignalsTrap(object):
return False
def load_fileobj(fileobj, gz = None, xmldoc = None, contenthandler = None):
def load_fileobj(fileobj, gz = None, zst = None, xmldoc = None, contenthandler = None):
"""
Parse the contents of the file object fileobj, and return the
contents as a LIGO Light Weight document tree. The file object
......@@ -342,12 +354,20 @@ def load_fileobj(fileobj, gz = None, xmldoc = None, contenthandler = None):
"""
if contenthandler is None:
raise ValueError("missing required keyword argument \"contenthandler\"")
if gz or gz is None:
if gz and zst:
raise ValueError('Cannot set both gz=True and zst=True')
elif gz is None and zst is None:
fileobj = RewindableInputFile(fileobj)
magic = fileobj.read(2)
magic = fileobj.read(4)
fileobj.seek(0, os.SEEK_SET)
if gz or magic == b'\037\213':
fileobj = gzip.GzipFile(mode = "rb", fileobj = fileobj)
if magic[:2] == b'\037\213':
gz = True
elif magic == b'\x28\xB5\x2F\xFD':
zst = True
if gz:
fileobj = gzip.GzipFile(mode = "rb", fileobj = fileobj)
elif zst:
fileobj = zstd.ZstdDecompressor().stream_reader(fileobj)
if xmldoc is None:
xmldoc = ligolw.Document()
ligolw.make_parser(contenthandler(xmldoc)).parse(fileobj)
......@@ -410,7 +430,7 @@ def load_url(url, verbose = False, **kwargs):
return load_fileobj(fileobj, **kwargs)
def write_fileobj(xmldoc, fileobj, gz = False, compresslevel = 3, **kwargs):
def write_fileobj(xmldoc, fileobj, gz = False, zst = False, compresslevel = 3, **kwargs):
"""
Writes the LIGO Light Weight document tree rooted at xmldoc to the
given file object. Internally, the .write() method of the xmldoc
......@@ -439,8 +459,12 @@ def write_fileobj(xmldoc, fileobj, gz = False, compresslevel = 3, **kwargs):
</LIGO_LW>
"""
with NoCloseFlushWrapper(fileobj) as fileobj:
if gz:
if gz and zst:
raise ValueError('Cannot set both gz=True and zst=True')
elif gz:
fileobj = gzip.GzipFile(mode = "wb", fileobj = fileobj, compresslevel = compresslevel)
elif zst:
fileobj = zstd.ZstdCompressor(level = compresslevel).stream_writer(fileobj)
with codecs.getwriter("utf_8")(fileobj) as fileobj:
xmldoc.write(fileobj, **kwargs)
......@@ -475,7 +499,7 @@ class tildefile(object):
return False
def write_filename(xmldoc, filename, verbose = False, gz = False, with_mv = True, trap_signals = SignalsTrap.default_signals, **kwargs):
def write_filename(xmldoc, filename, verbose = False, gz = False, zst = False, with_mv = True, trap_signals = SignalsTrap.default_signals, **kwargs):
"""
Writes the LIGO Light Weight document tree rooted at xmldoc to the
file name filename. If filename is None the file is written to
......@@ -521,9 +545,11 @@ def write_filename(xmldoc, filename, verbose = False, gz = False, with_mv = True
else:
if not gz and filename.endswith(".gz"):
warnings.warn("filename '%s' ends in '.gz' but file is not being gzip-compressed" % filename, UserWarning)
if not zst and filename.endswith(".zst"):
warnings.warn("filename '%s' ends in '.zst' but file is not being zstandard-compressed" % filename, UserWarning)
binary_open = lambda filename: open(filename, 'wb')
with (binary_open if not with_mv else tildefile)(filename) as fileobj:
write_fileobj(xmldoc, fileobj, gz = gz, **kwargs)
write_fileobj(xmldoc, fileobj, gz = gz, zst = zst, **kwargs)
def write_url(xmldoc, url, **kwargs):
......
......@@ -301,7 +301,7 @@ def extract(connection, filename, table_names = None, verbose = False, xsl_file
"""
xmldoc = ligolw.Document()
xmldoc.appendChild(dbtables.get_xml(connection, table_names))
ligolw_utils.write_filename(xmldoc, filename, gz = (filename or "stdout").endswith(".gz"), verbose = verbose, xsl_file = xsl_file)
ligolw_utils.write_filename(xmldoc, filename, gz = (filename or "stdout").endswith(".gz"), zst = (filename or "stdout").endswith(".zst"), verbose = verbose, xsl_file = xsl_file)
# delete cursors
xmldoc.unlink()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment