diff --git a/CHANGELOG.md b/CHANGELOG.md index f13e5f2..229f8fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm --- +## [5.0.0-rc.3] - 2023-12-07 +### Added +- Integrity test for compressed files + +--- + ## [5.0.0-rc.2] - 2023-12-06 ### Added - FASTQ validator diff --git a/README.md b/README.md index d0c0f1b..bc5c8c1 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ options: Path to reference file for CRAM -p PROCESSES, --processes PROCESSES Number of processes to run in parallel when validating multiple files + -t, --test-integrity Whether to perform a full integrity test on compressed files ``` The tool will attempt to automatically detect the file type based on extension and perform the approriate validations. The tool will also perform an existence check along with a checksum check if an MD5 or SHA512 checksum exists regardless of file type. diff --git a/pipeval/__init__.py b/pipeval/__init__.py index 981705a..c09e17f 100644 --- a/pipeval/__init__.py +++ b/pipeval/__init__.py @@ -1,3 +1,3 @@ '''Inits pipeval module''' -__version__ = '5.0.0-rc.2' +__version__ = '5.0.0-rc.3' diff --git a/pipeval/validate/__main__.py b/pipeval/validate/__main__.py index 50288c6..11f67b8 100644 --- a/pipeval/validate/__main__.py +++ b/pipeval/validate/__main__.py @@ -28,5 +28,7 @@ def add_subparser_validate(subparsers:argparse._SubParsersAction): help='Path to reference file for CRAM') parser.add_argument('-p', '--processes', type=positive_integer, default=1, \ help='Number of processes to run in parallel when validating multiple files') + parser.add_argument('-t', '--test-integrity', action='store_true', \ + help='Whether to perform a full integrity test on compressed files') parser.set_defaults(func=run_validate) diff --git a/pipeval/validate/files.py b/pipeval/validate/files.py index 31eb5e7..9d249be 100644 --- a/pipeval/validate/files.py +++ b/pipeval/validate/files.py @@ 
-1,16 +1,60 @@
 ''' File checking functions '''
 from pathlib import Path
+from typing import Callable
 import warnings
+import zlib
+import gzip
+import bz2
 import magic
 
-def _check_compressed(path:Path):
-    ''' Check file is compressed '''
-    compression_mimes = [
-        'application/x-gzip',
-        'application/x-bzip2'
-    ]
-    if magic.from_file(path.resolve(), mime=True) not in compression_mimes:
+def _identify_compression(path:Path):
+    ''' Identify compression type and returns appropriate file handler '''
+    compression_handlers = {
+        'application/x-gzip': gzip.open,
+        'application/x-bzip2': bz2.open
+    }
+
+    file_mime = magic.from_file(path.resolve(), mime=True)
+    return compression_handlers.get(file_mime, None)
+
+def _check_compression_integrity(path:Path, handler:Callable):
+    ''' Verify integrity of compressed file by decompressing it end to end
+
+    `handler` is the opener matching the compression type (`gzip.open` or
+    `bz2.open`). Raises TypeError if the stream cannot be fully decompressed.
+    '''
+    integrity_error = ''
+    read_chunk_size = 1000000 # 1 MB chunks
+
+    with handler(path, 'rb') as file_reader:
+        try:
+            while file_reader.read(read_chunk_size) != b'':
+                pass
+        except gzip.BadGzipFile as bad_gzip:
+            integrity_error = f'Invalid Gzip file: {bad_gzip}'
+        except EOFError as eof_error:
+            integrity_error = f'Truncated or corrupted file: {eof_error}'
+        except zlib.error as zlib_error:
+            integrity_error = f'Decompression error: {zlib_error}'
+        # bz2 reports a corrupt stream as a plain OSError; this clause must stay
+        # after BadGzipFile, which is itself an OSError subclass
+        except OSError as os_error:
+            integrity_error = f'Invalid compressed data: {os_error}'
+
+    if integrity_error != '':
+        raise TypeError(f'Compression integrity check failed: {integrity_error}')
+
+def _check_compressed(path:Path, test_integrity:bool):
+    ''' Check file compression '''
+
+    file_handler = _identify_compression(path)
+
+    if file_handler is None:
         warnings.warn(f'Warning: file {path} is not compressed.')
+        return
+
+    if test_integrity:
+        _check_compression_integrity(path, file_handler)
 
 def _path_exists(path:Path):
     ''' Check if path exists '''
diff --git a/pipeval/validate/validate.py b/pipeval/validate/validate.py
index a3800c6..a54c1ad 100644
--- a/pipeval/validate/validate.py
+++ b/pipeval/validate/validate.py
@@ -54,7 +54,7 @@ def _validate_file(
         raise TypeError(f'File 
{path} does not have a valid extension.')
 
     if file_type in CHECK_COMPRESSION_TYPES:
-        _check_compressed(path)
+        _check_compressed(path, args.test_integrity)
 
     _validate_checksums(path)
 
diff --git a/pipeval/validate/validate_types.py b/pipeval/validate/validate_types.py
index ea52cbf..1c91e1d 100644
--- a/pipeval/validate/validate_types.py
+++ b/pipeval/validate/validate_types.py
@@ -3,5 +3,5 @@
 
 ValidateArgs = namedtuple(
     'args',
-    'path, cram_reference, processes'
+    'path, cram_reference, processes, test_integrity'
 )
diff --git a/test/unit/test_validate.py b/test/unit/test_validate.py
index 07c6766..86a6128 100644
--- a/test/unit/test_validate.py
+++ b/test/unit/test_validate.py
@@ -4,6 +4,7 @@
 from argparse import Namespace, ArgumentTypeError
 from unittest.mock import Mock, mock_open
 import warnings
+import zlib
 import gzip
 import bz2
 import mock
@@ -11,7 +12,9 @@
 
 from pipeval.validate.files import (
     _check_compressed,
-    _path_exists
+    _path_exists,
+    _identify_compression,
+    _check_compression_integrity
 )
 from pipeval.validate.validators.bam import (
     _validate_bam_file,
@@ -107,9 +110,10 @@ def test__path_exists__errors_for_non_existing_path(mock_path):
 @mock.patch('pipeval.validate.files.Path', autospec=True)
 def test__check_compressed__raises_warning_for_uncompressed_path(mock_path, mock_magic):
     mock_magic.return_value = 'text/plain'
+    test_args = ValidateArgs(path=[], cram_reference=None, processes=1, test_integrity=False)
 
     with pytest.warns(UserWarning):
-        _check_compressed(mock_path)
+        _check_compressed(mock_path, test_args.test_integrity)
 
 @pytest.mark.parametrize(
     'compression_mime',
@@ -118,14 +122,21 @@ def test__check_compressed__raises_warning_for_uncompressed_path(mock_path, mock
         ('application/x-bzip2')
     ]
 )
+@mock.patch('pipeval.validate.files._check_compression_integrity')
 @mock.patch('pipeval.validate.files.magic.from_file')
 @mock.patch('pipeval.validate.files.Path', autospec=True)
-def test__check_compressed__passes_compression_check(mock_path, mock_magic, compression_mime):
+def test__check_compressed__passes_compression_check(
+    mock_path,
+    mock_magic,
+    mock_integrity,
+    compression_mime):
     mock_magic.return_value = compression_mime
+    mock_integrity.return_value = None
+    test_args = ValidateArgs(path=[], cram_reference=None, processes=1, test_integrity=False)
 
     with warnings.catch_warnings():
         warnings.filterwarnings("error")
-        _check_compressed(mock_path)
+        _check_compressed(mock_path, test_args.test_integrity)
 
 @mock.patch('pipeval.validate.validators.bam.pysam')
 def test__validate_bam_file__empty_bam_file(mock_pysam):
@@ -236,7 +247,7 @@ def test__validate_vcf_file__passes_vcf_validation(mock_call):
     _validate_vcf_file('some/file')
 
 def test__run_validate__passes_validation_no_files():
-    test_args = ValidateArgs(path=[], cram_reference=None, processes=1)
+    test_args = ValidateArgs(path=[], cram_reference=None, processes=1, test_integrity=False)
     run_validate(test_args)
 
 @pytest.mark.parametrize(
@@ -259,7 +270,11 @@ def test___validation_worker__fails_with_failing_checks(
     mock_detect_file_type_and_extension,
     test_exception):
     test_path = 'some/path'
-    test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1)
+    test_args = ValidateArgs(
+        path=[test_path],
+        cram_reference=None,
+        processes=1,
+        test_integrity=False)
     mock_path_resolve.return_value = test_path
     mock_validate_file.side_effect = test_exception
     mock_detect_file_type_and_extension.return_value = ('', '')
@@ -274,7 +289,11 @@ def test__run_validate__passes_on_all_valid_files(
     mock_path_resolve
     ):
     test_path = 'some/path'
-    test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1)
+    test_args = ValidateArgs(
+        path=[test_path],
+        cram_reference=None,
+        processes=1,
+        test_integrity=False)
     mock_path_resolve.return_value = None
     mock_pool.return_value.__enter__.return_value = Namespace(starmap=lambda y, z: [True])
 
@@ -287,7 +306,11 @@ def test__run_validate__fails_with_failing_file(
     mock_pool,
     mock_path_resolve):
     test_path = 'some/path'
-    
test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) + test_args = ValidateArgs( + path=[test_path], + cram_reference=None, + processes=1, + test_integrity=False) expected_code = 1 mock_path_resolve.return_value = None @@ -326,7 +349,13 @@ def test__validate_file__checks_compression( mock_path_exists.return_value = True mock_check_function_switch.return_value = {} - _validate_file('', test_file_types, 'ext', None) + test_args = ValidateArgs( + path=[], + cram_reference=None, + processes=1, + test_integrity=False) + + _validate_file('', test_file_types, 'ext', test_args) mock_check_compressed.assert_called_once() @@ -337,7 +366,11 @@ def test__run_validate__fails_on_unresolvable_symlink(mock_path_resolve): test_path = 'some/path' - test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) + test_args = ValidateArgs( + path=[test_path], + cram_reference=None, + processes=1, + test_integrity=False) with pytest.raises(expected_error): run_validate(test_args) @@ -358,7 +391,11 @@ def test___validation_worker__passes_proper_validation( test_path = 'some/path' - test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) + test_args = ValidateArgs( + path=[test_path], + cram_reference=None, + processes=1, + test_integrity=False) _validation_worker(test_path, test_args) @@ -480,3 +517,52 @@ def test__validate_fastq__passes_valid_fastq( with mock.patch("builtins.open", mock_open(read_data=test_data)) as mock_file: test_fastq = FASTQ(Path('test/path')) test_fastq.validate_fastq() + + +# pylint: disable=W0212 +@pytest.mark.parametrize( + 'test_file_type, test_handler', + [ + ('application/x-gzip', gzip.open), + ('application/x-bzip2', bz2.open), + ('any/other', None) + ] +) +@mock.patch('pipeval.validate.files.magic.from_file') +def test___identify_compression__identified_correct_handler( + mock_from_file, + test_file_type, + test_handler): + mock_from_file.return_value = test_file_type + + identifier_handler = 
_identify_compression(Path('test/path')) + + assert identifier_handler == test_handler + +@pytest.mark.parametrize( + 'test_handler', + [ + ("gzip.open"), + ("bz2.open") + ] +) +def test___check_compression_integrity__passes_valid_file(test_handler): + with mock.patch(test_handler, mock_open(read_data=b'data')) as mock_file: + _check_compression_integrity('test/path', mock_file) + +@pytest.mark.parametrize( + 'test_handler, test_exception', + [ + ("gzip.open", gzip.BadGzipFile), + ("gzip.open", EOFError), + ("gzip.open", zlib.error), + ("bz2.open", EOFError), + ("bz2.open", zlib.error) + ] +) +def test___check_compression_integrity__raises_on_exception(test_handler, test_exception): + with mock.patch(test_handler, mock_open(read_data=b'data')) as mock_file: + mock_file.return_value.read.side_effect = test_exception + + with pytest.raises(TypeError): + _check_compression_integrity('test/path', mock_file)