Support Audio feature for TAR archives in sequential access (huggingface#3129)

albertvillanova · web-flow · commit 07872f7454d8 · 2021-11-17T18:42:07.000+01:00
* Add test fixture for TAR WAV file * Add test iter_archive * Test dataset with Audio feature for TAR archive * Add Audio method to decode from bytes instead of path * Add Audio support for bytes besides path * Fix docstring * Remove archived attribute from test audio with TAR archive * Remove archived attribute from Audio feature * Implement Audio.encode_example * Call Audio.encode_example from encode_nested_example * Fix docs * Enhance Audio.decode_example to accept a string * Fix docs * Implement private Audio._storage_dtype to specify cached dtype * Change Audio._storage_dtype dynamically when encoding a string * Update test of Audio instantiation * Set ArrowWriter.schema property dynamically calculated from features * Update ArrowWriter.write_examples_on_file * Update ArrowWriter._build_writer * Fix code quality * Replace _schema with schema and condition on schema in ArrowWriter * Add test for MP3 TAR audio file * Refactor Audio decode_example * Pass raw bytes to torchaudio.load * Revert "Pass raw bytes to torchaudio.load" This reverts commit c973209. * Pass format to load in _decode_example_with_torchaudio * Fix filename extension in test * Fix Audio tests CI * Fix Audio tests CI * Fix audio test CI by checking out PR HEAD commit instead of merge commit * Change default Audio storage dtype to string * Rename Audio decode functions * Refactor Audio decode_example * Force CI re-run * Refactor and rename * Fix docstring * Fix docstrings
diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml
@@ -1,9 +1,6 @@
 name: Test audio
 
 on:
-  push:
-    branches:
-    - master
   pull_request:
     branches:
     - master
@@ -12,15 +9,17 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
+      - name: Install OS dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install libsndfile1 sox
       - uses: actions/checkout@v2
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
           python-version: "3.6"
-      - name: Install OS dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install libsndfile1 sox
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
@@ -207,7 +207,7 @@ def __init__(
             raise ValueError("At least one of path and stream must be provided.")
         if features is not None:
             self._features = features
-            self._schema = pa.schema(features.type)
+            self._schema = None
         elif schema is not None:
             self._schema: pa.Schema = schema
             self._features = Features.from_arrow_schema(self._schema)
@@ -222,9 +222,7 @@ def __init__(
             self._hasher = KeyHasher("")
 
         self._check_duplicates = check_duplicates
-
-        if disable_nullable and self._schema is not None:
-            self._schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in self._schema)
+        self._disable_nullable = disable_nullable
 
         self._path = path
         if stream is None:
@@ -269,6 +267,7 @@ def close(self):
             self.stream.close()  # This also closes self.pa_writer if it is opened
 
     def _build_writer(self, inferred_schema: pa.Schema):
+        schema = self.schema
         inferred_features = Features.from_arrow_schema(inferred_schema)
         if self._features is not None:
             if self.update_features:  # keep original features it they match, or update them
@@ -279,21 +278,27 @@ def _build_writer(self, inferred_schema: pa.Schema):
                         if inferred_field == fields[name]:
                             inferred_features[name] = self._features[name]
                 self._features = inferred_features
-                self._schema: pa.Schema = inferred_schema
+                schema: pa.Schema = inferred_schema
         else:
             self._features = inferred_features
-            self._schema: pa.Schema = inferred_schema
+            schema: pa.Schema = inferred_schema
         if self.disable_nullable:
-            self._schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in self._schema)
+            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
         if self.with_metadata:
-            self._schema = self._schema.with_metadata(
-                self._build_metadata(DatasetInfo(features=self._features), self.fingerprint)
-            )
-        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
+            schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=self._features), self.fingerprint))
+        self._schema = schema
+        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, schema)
 
     @property
     def schema(self):
-        return self._schema if self._schema is not None else []
+        _schema = (
+            self._schema
+            if self._schema is not None
+            else (pa.schema(self._features.type) if self._features is not None else None)
+        )
+        if self._disable_nullable and _schema is not None:
+            _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema)
+        return _schema if _schema is not None else []
 
     @staticmethod
     def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> Dict[str, str]:
@@ -312,18 +317,18 @@ def write_examples_on_file(self):
 
         # Since current_examples contains (example, key) tuples
         cols = (
-            [col for col in self._schema.names if col in self.current_examples[0][0]]
-            + [col for col in self.current_examples[0][0].keys() if col not in self._schema.names]
-            if self._schema
+            [col for col in self.schema.names if col in self.current_examples[0][0]]
+            + [col for col in self.current_examples[0][0].keys() if col not in self.schema.names]
+            if self.schema
             else self.current_examples[0][0].keys()
         )
 
-        schema = None if self.pa_writer is None and self.update_features else self._schema
-        try_schema = self._schema if self.pa_writer is None and self.update_features else None
+        schema = None if self.pa_writer is None and self.update_features else self.schema
+        try_schema = self.schema if self.pa_writer is None and self.update_features else None
         arrays = []
         inferred_types = []
         for col in cols:
-            col_type = schema.field(col).type if schema is not None else None
+            col_type = schema.field(col).type if schema else None
             col_try_type = try_schema.field(col).type if try_schema is not None and col in try_schema.names else None
             typed_sequence = OptimizedTypedSequence(
                 [row[0][col] for row in self.current_examples], type=col_type, try_type=col_try_type, col=col
@@ -339,7 +344,7 @@ def write_examples_on_file(self):
                     )
             arrays.append(pa_array)
             inferred_types.append(inferred_type)
-        schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self._schema
+        schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self.schema
         table = pa.Table.from_arrays(arrays, schema=schema)
         self.write_table(table)
         self.current_examples = []
@@ -420,11 +425,11 @@ def write_batch(
         """
         if batch_examples and len(next(iter(batch_examples.values()))) == 0:
             return
-        schema = None if self.pa_writer is None and self.update_features else self._schema
-        try_schema = self._schema if self.pa_writer is None and self.update_features else None
+        schema = None if self.pa_writer is None and self.update_features else self.schema
+        try_schema = self.schema if self.pa_writer is None and self.update_features else None
         typed_sequence_examples = {}
         for col in sorted(batch_examples.keys()):
-            col_type = schema.field(col).type if schema is not None else None
+            col_type = schema.field(col).type if schema else None
             col_try_type = try_schema.field(col).type if try_schema is not None and col in try_schema.names else None
             typed_sequence = OptimizedTypedSequence(batch_examples[col], type=col_type, try_type=col_try_type, col=col)
             typed_sequence_examples[col] = typed_sequence
@@ -459,8 +464,8 @@ def finalize(self, close_stream=True):
             self.hkey_record = []
         self.write_examples_on_file()
         if self.pa_writer is None:
-            if self._schema is not None:
-                self._build_writer(self._schema)
+            if self.schema:
+                self._build_writer(self.schema)
             else:
                 raise ValueError("Please pass `features` or at least one example when writing data")
         self.pa_writer.close()
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from dataclasses import dataclass, field
+from io import BytesIO
 from typing import Any, ClassVar, Optional
 
 import pyarrow as pa
@@ -11,50 +12,98 @@
 class Audio:
     """Audio Feature to extract audio data from an audio file.
 
+    Input: The Audio feature accepts as input:
+    - A :obj:`str`: Absolute path to the audio file (i.e. random access is allowed).
+    - A :obj:`dict` with the keys:
+
+        - path: String with the path of the audio file relative to the archive file.
+        - bytes: Bytes content of the audio file.
+
+      This is useful for archived files with sequential access.
+
     Args:
         sampling_rate (:obj:`int`, optional): Target sampling rate. If `None`, the native sampling rate is used.
-        mono (:obj:`bool`, default ```True``): Whether to convert the audio signal to mono by averaging samples across channels.
+        mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across
+            channels.
     """
 
     sampling_rate: Optional[int] = None
     mono: bool = True
+    _storage_dtype: str = "string"
     id: Optional[str] = None
     # Automatically constructed
     dtype: ClassVar[str] = "dict"
     pa_type: ClassVar[Any] = None
     _type: str = field(default="Audio", init=False, repr=False)
 
     def __call__(self):
-        return pa.string()
+        return (
+            pa.struct({"path": pa.string(), "bytes": pa.binary()}) if self._storage_dtype == "struct" else pa.string()
+        )
+
+    def encode_example(self, value):
+        """Encode example into a format for Arrow.
+
+        Args:
+            value (:obj:`str` or :obj:`dict`): Data passed as input to Audio feature.
+
+        Returns:
+            :obj:`str` or :obj:`dict`
+        """
+        if isinstance(value, dict):
+            self._storage_dtype = "struct"
+        return value
 
     def decode_example(self, value):
         """Decode example audio file into audio data.
 
         Args:
-            value: Audio file path.
+            value (:obj:`str` or :obj:`dict`): Either a string with the absolute audio file path or a dictionary with
+                keys:
+
+                - path: String with relative audio file path.
+                - bytes: Bytes of the audio file.
 
         Returns:
             dict
         """
-        # TODO: backard compatibility for users without audio dependencies
-        array, sampling_rate = (
-            self._decode_example_with_torchaudio(value)
-            if value.endswith(".mp3")
-            else self._decode_example_with_librosa(value)
-        )
-        return {"path": value, "array": array, "sampling_rate": sampling_rate}
+        path, file = (value["path"], BytesIO(value["bytes"])) if isinstance(value, dict) else (value, None)
+        if path.endswith("mp3"):
+            array, sampling_rate = self._decode_mp3(file if file else path)
+        else:
+            if file:
+                array, sampling_rate = self._decode_non_mp3_file_like(file)
+            else:
+                array, sampling_rate = self._decode_non_mp3_path_like(path)
+        return {"path": path, "array": array, "sampling_rate": sampling_rate}
 
-    def _decode_example_with_librosa(self, value):
+    def _decode_non_mp3_path_like(self, path):
         try:
             import librosa
         except ImportError as err:
             raise ImportError("To support decoding audio files, please install 'librosa'.") from err
 
-        with xopen(value, "rb") as f:
+        with xopen(path, "rb") as f:
             array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono)
         return array, sampling_rate
 
-    def _decode_example_with_torchaudio(self, value):
+    def _decode_non_mp3_file_like(self, file):
+        try:
+            import librosa
+            import soundfile as sf
+        except ImportError as err:
+            raise ImportError("To support decoding audio files, please install 'librosa'.") from err
+
+        array, sampling_rate = sf.read(file)
+        array = array.T
+        if self.mono:
+            array = librosa.to_mono(array)
+        if self.sampling_rate and self.sampling_rate != sampling_rate:
+            array = librosa.resample(array, sampling_rate, self.sampling_rate, res_type="kaiser_best")
+            sampling_rate = self.sampling_rate
+        return array, sampling_rate
+
+    def _decode_mp3(self, path_or_file):
         try:
             import torchaudio
             import torchaudio.transforms as T
@@ -65,7 +114,7 @@ def _decode_example_with_torchaudio(self, value):
         except RuntimeError as err:
             raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err
 
-        array, sampling_rate = torchaudio.load(value)
+        array, sampling_rate = torchaudio.load(path_or_file, format="mp3")
         if self.sampling_rate and self.sampling_rate != sampling_rate:
             if not hasattr(self, "_resampler"):
                 self._resampler = T.Resample(sampling_rate, self.sampling_rate)
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
@@ -849,7 +849,7 @@ def encode_nested_example(schema, obj):
             return list(obj)
     # Object with special encoding:
     # ClassLabel will convert from string to int, TranslationVariableLanguages does some checks
-    elif isinstance(schema, (ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)):
+    elif isinstance(schema, (Audio, ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)):
         return schema.encode_example(obj)
     # Other object should be directly convertible to a native Arrow type (like Translation and Translation)
     return obj
@@ -961,7 +961,8 @@ class Features(dict):
            :class:`datasets.Sequence`.
 
         - a :class:`Array2D`, :class:`Array3D`, :class:`Array4D` or :class:`Array5D` feature for multidimensional arrays
-        - a :class:`datasets.Audio` stores the path to an audio file and can extract audio data from it
+        - an :class:`Audio` feature to store the absolute path to an audio file or a dictionary with the relative path
+          to an audio file ("path" key) and its bytes content ("bytes" key). This feature extracts the audio data.
         - :class:`datasets.Translation` and :class:`datasets.TranslationVariableLanguages`, the two features specific to Machine Translation
     """
 
diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py