deepset-ai · mdrazak2001 · Feb 24, 2025 · Feb 24, 2025 · Feb 28, 2025 · Feb 28, 2025
@@ -7,6 +7,8 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+import pandas as pd
+
 from haystack import Document, component, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 from haystack.dataclasses import ByteStream
@@ -35,7 +37,7 @@ class CSVToDocument:
     ```
     """
 
-    def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
+    def __init__(self, encoding: str = "utf-8", store_full_path: bool = False, split_by_row: bool = False):
         """
         Creates a CSVToDocument component.
 
@@ -46,9 +48,35 @@ def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
         :param store_full_path:
             If True, the full path of the file is stored in the metadata of the document.
             If False, only the file name is stored.
+        :param split_by_row:
+            If True, each row of the CSV file is converted into a separate document.
+            If False, the entire CSV file is converted into a single document.
         """
         self.encoding = encoding
         self.store_full_path = store_full_path
+        self.split_by_row = split_by_row
+
+    def _convert_file_mode(self, data: str, metadata: Dict[str, Any]) -> List[Document]:
+        """
+        Wrap the complete CSV text in a single document.
+
+        :param data:
+            The CSV text used as the document content.
+        :param metadata:
+            Metadata attached to the document.
+        :returns:
+            A one-element list holding the document.
+        """
+        whole_file_document = Document(content=data, meta=metadata)
+        return [whole_file_document]
+
+    def _convert_row_mode(self, data: str, metadata: Dict[str, Any]) -> List[Document]:
+        """
+        Convert each CSV row into a separate document.
+
+        The content of every document is the header line followed by that row, both serialized as CSV,
+        so fields containing commas, quotes or line breaks are quoted and stay aligned with the header.
+        Cell values are kept as the text found in the file: no numeric type inference is applied and
+        empty cells stay empty instead of becoming "nan".
+
+        :param data:
+            The CSV text to split into rows.
+        :param metadata:
+            Metadata copied into every document; a zero-based "row_index" entry is added per row.
+        :returns:
+            One document per data row, or an empty list if the CSV could not be converted.
+        """
+        import csv  # local import so this method does not rely on a module-level `csv` import
+
+        def _to_csv_line(values) -> str:
+            """Serialize one sequence of fields as a single CSV line without the line terminator."""
+            buffer = io.StringIO()
+            csv.writer(buffer).writerow(values)
+            # A field ending in "\r" or "\n" is always quoted, so only the terminator is stripped here.
+            return buffer.getvalue().rstrip("\r\n")
+
+        try:
+            # dtype=str and keep_default_na=False preserve the original cell text: without them an
+            # integer column with one empty cell would be rendered as "27.0" / "nan".
+            df = pd.read_csv(io.StringIO(data), dtype=str, keep_default_na=False)
+            header = _to_csv_line(df.columns)
+
+            return [
+                Document(content=f"{header}\n{_to_csv_line(row)}", meta={**metadata, "row_index": idx})
+                for idx, row in enumerate(df.itertuples(index=False, name=None))
+            ]
+        except Exception as e:
+            # Best effort: a CSV that cannot be converted is reported and yields no documents.
+            logger.warning("Error converting CSV rows to documents: {error}", error=e)
+            return []
 
     @component.output_types(documents=List[Document])
     def run(
@@ -98,7 +126,9 @@ def run(
                 if file_path:  # Ensure the value is not None for pylint
                     merged_metadata["file_path"] = os.path.basename(file_path)
 
-            document = Document(content=data, meta=merged_metadata)
-            documents.append(document)
+            if self.split_by_row:
+                documents.extend(self._convert_row_mode(data, merged_metadata))
+            else:
+                documents.extend(self._convert_file_mode(data, merged_metadata))
 
         return {"documents": documents}
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    Enhance the CSVToDocument component to support row-level conversion.
+    - Adds a 'split_by_row' parameter to convert each row of a CSV file into a separate Haystack Document.
+    - Retains the header row (field names) as the first line of the 'content' in each row-level Document.
@@ -101,3 +101,24 @@ def test_run_with_meta(self):
 
         # check that the metadata from the bytestream is merged with that from the meta parameter
         assert document.meta == {"name": "test_name", "language": "it"}
+
+    def test_run_split_by_row_true(self, test_files_path):
+        """
+        With split_by_row=True, every CSV data row becomes its own document, prefixed by the header line.
+        """
+        expected_header = "Name,Age"
+        expected_contents = [
+            f"{expected_header}\nJohn Doe,27",
+            f"{expected_header}\nJane Smith,37",
+            f"{expected_header}\nMike Johnson,47",
+        ]
+
+        csv_path = test_files_path / "csv" / "sample_1.csv"
+        result = CSVToDocument(split_by_row=True).run(sources=[csv_path])
+        docs = result["documents"]
+
+        assert len(docs) == 3
+        # Documents must come back in file order, each tagged with its zero-based row position.
+        for position, (doc, expected_content) in enumerate(zip(docs, expected_contents)):
+            assert doc.content == expected_content
+            assert doc.meta["row_index"] == position