Skip to content

Commit ad3b8f6

Browse files
2010YOUY01 and alamb authored
parallel csv scan (#6801)
* parallel csv scan
* add max line length
* Update according to review comments
* Update Configuration doc

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 50135e8 commit ad3b8f6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1424
-573
lines changed

Cargo.toml

+7-21
Original file line number | Diff line number | Diff line change
@@ -17,40 +17,26 @@
1717

1818
[workspace]
1919
exclude = ["datafusion-cli"]
20-
members = [
21-
"datafusion/common",
22-
"datafusion/core",
23-
"datafusion/expr",
24-
"datafusion/execution",
25-
"datafusion/optimizer",
26-
"datafusion/physical-expr",
27-
"datafusion/proto",
28-
"datafusion/proto/gen",
29-
"datafusion/row",
30-
"datafusion/sql",
31-
"datafusion/substrait",
32-
"datafusion-examples",
33-
"test-utils",
34-
"benchmarks",
20+
members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/proto", "datafusion/proto/gen", "datafusion/row", "datafusion/sql", "datafusion/substrait", "datafusion-examples", "test-utils", "benchmarks",
3521
]
3622
resolver = "2"
3723

3824
[workspace.package]
39-
version = "27.0.0"
40-
edition = "2021"
41-
readme = "README.md"
4225
authors = ["Apache Arrow <[email protected]>"]
43-
license = "Apache-2.0"
26+
edition = "2021"
4427
homepage = "https://github.com/apache/arrow-datafusion"
28+
license = "Apache-2.0"
29+
readme = "README.md"
4530
repository = "https://github.com/apache/arrow-datafusion"
4631
rust-version = "1.64"
32+
version = "27.0.0"
4733

4834
[workspace.dependencies]
4935
arrow = { version = "43.0.0", features = ["prettyprint", "dyn_cmp_dict"] }
50-
arrow-flight = { version = "43.0.0", features = ["flight-sql-experimental"] }
36+
arrow-array = { version = "43.0.0", default-features = false, features = ["chrono-tz"] }
5137
arrow-buffer = { version = "43.0.0", default-features = false }
38+
arrow-flight = { version = "43.0.0", features = ["flight-sql-experimental"] }
5239
arrow-schema = { version = "43.0.0", default-features = false }
53-
arrow-array = { version = "43.0.0", default-features = false, features = ["chrono-tz"] }
5440
parquet = { version = "43.0.0", features = ["arrow", "async", "object_store"] }
5541
sqlparser = { version = "0.35", features = ["visitor"] }
5642

datafusion-cli/Cargo.lock

+44-44
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/src/config.rs

+7-4
Original file line number | Diff line number | Diff line change
@@ -323,10 +323,13 @@ config_namespace! {
323323
/// long-running execution, all types of joins may encounter out-of-memory errors.
324324
pub allow_symmetric_joins_without_pruning: bool, default = true
325325

326-
/// When set to true, file groups will be repartitioned to achieve maximum parallelism.
327-
/// Currently supported only for Parquet format in which case
328-
/// multiple row groups from the same file may be read concurrently. If false then each
329-
/// row group is read serially, though different files may be read in parallel.
326+
/// When set to `true`, file groups will be repartitioned to achieve maximum parallelism.
327+
/// Currently Parquet and CSV formats are supported.
328+
///
329+
/// If set to `true`, all files will be repartitioned evenly (i.e., a single large file
330+
/// might be partitioned into smaller chunks) for parallel scanning.
331+
/// If set to `false`, different files will be read in parallel, but repartitioning won't
332+
/// happen within a single file.
330333
pub repartition_file_scans: bool, default = true
331334

332335
/// Should DataFusion repartition data using the partitions keys to execute window

datafusion/core/Cargo.toml

+1
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,7 @@ env_logger = "0.10"
107107
half = "2.2.1"
108108
postgres-protocol = "0.6.4"
109109
postgres-types = { version = "0.2.4", features = ["derive", "with-chrono-0_4"] }
110+
regex = "1.5.4"
110111
rstest = "0.18.0"
111112
rust_decimal = { version = "1.27.0", features = ["tokio-pg"] }
112113
sqllogictest = "0.14.0"

0 commit comments

Comments
 (0)