Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: csv to document row level conversion #8916

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
36 changes: 33 additions & 3 deletions haystack/components/converters/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import pandas as pd

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
Expand Down Expand Up @@ -35,7 +37,7 @@ class CSVToDocument:
```
"""

def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = False, split_by_row: bool = False):
"""
Creates a CSVToDocument component.

Expand All @@ -46,9 +48,35 @@ def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
:param split_by_row:
If True, each row of the CSV file is converted into a separate document.
If False, the entire CSV file is converted into a single document.
"""
self.encoding = encoding
self.store_full_path = store_full_path
self.split_by_row = split_by_row

def _convert_file_mode(self, data: str, metadata: Dict[str, Any]) -> List[Document]:
    """
    Wrap the whole CSV text in one Document.

    :param data: The CSV file content as text; stored unchanged as the document content.
    :param metadata: Metadata attached to the resulting document.
    :returns: A single-element list holding the document.
    """
    whole_file_document = Document(meta=metadata, content=data)
    return [whole_file_document]

def _convert_row_mode(self, data: str, metadata: Dict[str, Any]) -> List[Document]:
    """
    Convert each CSV data row into its own Document.

    The content of every document is two CSV lines: the header line followed by the row.
    Cell text is kept as written in the file (no numeric or NaN inference), and fields
    containing delimiters, quotes or newlines are quoted so each content stays valid CSV.

    :param data: The CSV file content as text; the first line is treated as the header.
    :param metadata: Metadata copied into every row document. A zero-based ``row_index``
        key is added per row.
    :returns: One Document per data row, or an empty list if the CSV cannot be parsed
        (a warning is logged in that case).
    """
    import csv  # local import: only this method needs the stdlib CSV writer

    # Only parsing is guarded: it is the step that fails on malformed or empty input.
    try:
        # dtype=str + keep_default_na=False keep cells verbatim: without them "007"
        # becomes "7", empty cells become "nan" and ints next to gaps become "27.0".
        df = pd.read_csv(io.StringIO(data), dtype=str, keep_default_na=False)
    except Exception as e:
        # Best-effort per source: report the problem and yield no documents.
        logger.warning("Error converting CSV rows to documents: {error}", error=e)
        return []

    def _to_csv_line(fields) -> str:
        """Serialise one record as a single CSV line without a trailing newline."""
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator="").writerow(fields)
        return buffer.getvalue()

    header = _to_csv_line(df.columns)
    return [
        Document(content=f"{header}\n{_to_csv_line(row)}", meta={**metadata, "row_index": idx})
        for idx, row in enumerate(df.itertuples(index=False, name=None))
    ]

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -98,7 +126,9 @@ def run(
if file_path: # Ensure the value is not None for pylint
merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=data, meta=merged_metadata)
documents.append(document)
if self.split_by_row:
documents.extend(self._convert_row_mode(data, merged_metadata))
else:
documents.extend(self._convert_file_mode(data, merged_metadata))

return {"documents": documents}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
features:
- |
Enhance the CSVToDocument component to support row-level conversion.
- Adds a 'split_by_row' parameter to convert each row of a CSV file into a separate Haystack Document.
- Retains the header row (field names) as the first line of the 'content' in each row-level Document.
21 changes: 21 additions & 0 deletions test/components/converters/test_csv_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,24 @@ def test_run_with_meta(self):

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"name": "test_name", "language": "it"}

def test_run_split_by_row_true(self, test_files_path):
    """
    Check that ``split_by_row=True`` yields one document per CSV data row.

    Each document's content must be the header line followed by that row, and its
    ``row_index`` metadata must be the row's zero-based position.
    """
    csv_path = test_files_path / "csv" / "sample_1.csv"
    result = CSVToDocument(split_by_row=True).run(sources=[csv_path])
    docs = result["documents"]

    expected_rows = ["John Doe,27", "Jane Smith,37", "Mike Johnson,47"]
    assert len(docs) == len(expected_rows)

    for position, (doc, row) in enumerate(zip(docs, expected_rows)):
        assert doc.content == "Name,Age\n" + row
        assert doc.meta["row_index"] == position
Loading