57 changes: 45 additions & 12 deletions _duckdb-stubs/__init__.pyi
@@ -95,6 +95,8 @@ __all__: list[str] = [
"execute",
"executemany",
"extract_statements",
"to_arrow_reader",
"to_arrow_table",
"fetch_arrow_table",
"fetch_df",
"fetch_df_chunk",
@@ -194,7 +196,11 @@ class DuckDBPyConnection:
def __exit__(self, exc_type: object, exc: object, traceback: object) -> None: ...
def append(self, table_name: str, df: pandas.DataFrame, *, by_name: bool = False) -> DuckDBPyConnection: ...
def array_type(self, type: sqltypes.DuckDBPyType, size: pytyping.SupportsInt) -> sqltypes.DuckDBPyType: ...
def arrow(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader: ...
def arrow(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader:
"""Alias of to_arrow_reader(). We recommend using to_arrow_reader() instead."""
...
def to_arrow_reader(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader: ...
def to_arrow_table(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.Table: ...
def begin(self) -> DuckDBPyConnection: ...
def checkpoint(self) -> DuckDBPyConnection: ...
def close(self) -> None: ...
@@ -222,12 +228,16 @@ class DuckDBPyConnection:
def execute(self, query: Statement | str, parameters: object = None) -> DuckDBPyConnection: ...
def executemany(self, query: Statement | str, parameters: object = None) -> DuckDBPyConnection: ...
def extract_statements(self, query: str) -> list[Statement]: ...
def fetch_arrow_table(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.Table: ...
def fetch_arrow_table(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.Table:
"""Deprecated: use to_arrow_table() instead."""
...
def fetch_df(self, *, date_as_object: bool = False) -> pandas.DataFrame: ...
def fetch_df_chunk(
self, vectors_per_chunk: pytyping.SupportsInt = 1, *, date_as_object: bool = False
) -> pandas.DataFrame: ...
def fetch_record_batch(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader: ...
def fetch_record_batch(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader:
"""Deprecated: use to_arrow_reader() instead."""
...
def fetchall(self) -> list[tuple[pytyping.Any, ...]]: ...
def fetchdf(self, *, date_as_object: bool = False) -> pandas.DataFrame: ...
def fetchmany(self, size: pytyping.SupportsInt = 1) -> list[tuple[pytyping.Any, ...]]: ...
@@ -487,7 +497,11 @@ class DuckDBPyRelation:
def arg_min(
self, arg_column: str, value_column: str, groups: str = "", window_spec: str = "", projected_columns: str = ""
) -> DuckDBPyRelation: ...
def arrow(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader: ...
def arrow(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader:
"""Alias of to_arrow_reader(). We recommend using to_arrow_reader() instead."""
...
def to_arrow_reader(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader: ...
def to_arrow_table(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.Table: ...
def avg(
self, column: str, groups: str = "", window_spec: str = "", projected_columns: str = ""
) -> DuckDBPyRelation: ...
@@ -533,12 +547,18 @@ class DuckDBPyRelation:
def favg(
self, column: str, groups: str = "", window_spec: str = "", projected_columns: str = ""
) -> DuckDBPyRelation: ...
def fetch_arrow_reader(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader: ...
def fetch_arrow_table(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.Table: ...
def fetch_arrow_reader(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader:
"""Deprecated: use to_arrow_reader() instead."""
...
def fetch_arrow_table(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.Table:
"""Deprecated: use to_arrow_table() instead."""
...
def fetch_df_chunk(
self, vectors_per_chunk: pytyping.SupportsInt = 1, *, date_as_object: bool = False
) -> pandas.DataFrame: ...
def fetch_record_batch(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader: ...
def fetch_record_batch(self, rows_per_batch: pytyping.SupportsInt = 1000000) -> pyarrow.lib.RecordBatchReader:
"""Deprecated: use to_arrow_reader() instead."""
...
def fetchall(self) -> list[tuple[pytyping.Any, ...]]: ...
def fetchdf(self, *, date_as_object: bool = False) -> pandas.DataFrame: ...
def fetchmany(self, size: pytyping.SupportsInt = 1) -> list[tuple[pytyping.Any, ...]]: ...
@@ -656,7 +676,6 @@ class DuckDBPyRelation:
def query(self, virtual_table_name: str, sql_query: str) -> DuckDBPyRelation: ...
def rank(self, window_spec: str, projected_columns: str = "") -> DuckDBPyRelation: ...
def rank_dense(self, window_spec: str, projected_columns: str = "") -> DuckDBPyRelation: ...
def record_batch(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.RecordBatchReader: ...
def row_number(self, window_spec: str, projected_columns: str = "") -> DuckDBPyRelation: ...
def select(self, *args: str | Expression, groups: str = "") -> DuckDBPyRelation: ...
def select_dtypes(self, types: pytyping.List[sqltypes.DuckDBPyType | str]) -> DuckDBPyRelation: ...
@@ -692,7 +711,6 @@ class DuckDBPyRelation:
self, column: str, groups: str = "", window_spec: str = "", projected_columns: str = ""
) -> DuckDBPyRelation: ...
def tf(self) -> dict[str, tensorflow.Tensor]: ...
def to_arrow_table(self, batch_size: pytyping.SupportsInt = 1000000) -> pyarrow.lib.Table: ...
def to_csv(
self,
file_name: str,
@@ -1067,9 +1085,18 @@ def array_type(
@pytyping.overload
def arrow(
rows_per_batch: pytyping.SupportsInt = 1000000, *, connection: DuckDBPyConnection | None = None
) -> pyarrow.lib.RecordBatchReader: ...
) -> pyarrow.lib.RecordBatchReader:
"""Alias of to_arrow_reader(). We recommend using to_arrow_reader() instead."""
...

@pytyping.overload
def arrow(arrow_object: pytyping.Any, *, connection: DuckDBPyConnection | None = None) -> DuckDBPyRelation: ...
def to_arrow_reader(
batch_size: pytyping.SupportsInt = 1000000, *, connection: DuckDBPyConnection | None = None
) -> pyarrow.lib.RecordBatchReader: ...
def to_arrow_table(
batch_size: pytyping.SupportsInt = 1000000, *, connection: DuckDBPyConnection | None = None
) -> pyarrow.lib.Table: ...
def begin(*, connection: DuckDBPyConnection | None = None) -> DuckDBPyConnection: ...
def checkpoint(*, connection: DuckDBPyConnection | None = None) -> DuckDBPyConnection: ...
def close(*, connection: DuckDBPyConnection | None = None) -> None: ...
@@ -1128,7 +1155,10 @@ def executemany(
def extract_statements(query: str, *, connection: DuckDBPyConnection | None = None) -> list[Statement]: ...
def fetch_arrow_table(
rows_per_batch: pytyping.SupportsInt = 1000000, *, connection: DuckDBPyConnection | None = None
) -> pyarrow.lib.Table: ...
) -> pyarrow.lib.Table:
"""Deprecated: use to_arrow_table() instead."""
...

def fetch_df(*, date_as_object: bool = False, connection: DuckDBPyConnection | None = None) -> pandas.DataFrame: ...
def fetch_df_chunk(
vectors_per_chunk: pytyping.SupportsInt = 1,
@@ -1138,7 +1168,10 @@ def fetch_df_chunk(
) -> pandas.DataFrame: ...
def fetch_record_batch(
rows_per_batch: pytyping.SupportsInt = 1000000, *, connection: DuckDBPyConnection | None = None
) -> pyarrow.lib.RecordBatchReader: ...
) -> pyarrow.lib.RecordBatchReader:
"""Deprecated: use to_arrow_reader() instead."""
...

def fetchall(*, connection: DuckDBPyConnection | None = None) -> list[tuple[pytyping.Any, ...]]: ...
def fetchdf(*, date_as_object: bool = False, connection: DuckDBPyConnection | None = None) -> pandas.DataFrame: ...
def fetchmany(
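For orientation, here is a minimal usage sketch of the renamed Arrow APIs these stubs describe. It is not part of the diff; it assumes duckdb is built from this branch and that pyarrow is installed.

```python
# Hedged sketch of the new connection-level Arrow methods from the stubs above;
# assumes a build of this branch and an installed pyarrow.
import duckdb

con = duckdb.connect()

con.execute("SELECT range AS i FROM range(5)")
tbl = con.to_arrow_table()                   # pyarrow.lib.Table (replaces fetch_arrow_table)

con.execute("SELECT range AS i FROM range(5)")
reader = con.to_arrow_reader(batch_size=2)   # pyarrow.lib.RecordBatchReader (replaces fetch_record_batch)
for batch in reader:
    print(batch.num_rows)
```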
4 changes: 4 additions & 0 deletions duckdb/__init__.py
@@ -143,6 +143,8 @@
table_function,
tf,
threadsafety,
to_arrow_reader,
to_arrow_table,
token_type,
tokenize,
torch,
@@ -374,6 +376,8 @@
"tf",
"threadsafety",
"threadsafety",
"to_arrow_reader",
"to_arrow_table",
"token_type",
"tokenize",
"torch",
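Since the package `__init__` re-exports the new names, they should also be usable at module level. A tiny hedged check, assuming a wheel built from this branch:

```python
# The new names are re-exported at package level and operate on the implicit
# default connection (assumption: wheel built from this branch).
import duckdb
from duckdb import to_arrow_reader, to_arrow_table

duckdb.execute("SELECT 42 AS answer")
print(to_arrow_table())   # same as duckdb.to_arrow_table()
```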
5 changes: 1 addition & 4 deletions duckdb/polars_io.py
@@ -270,10 +270,7 @@ def source_generator(
# Try to pushdown filter, if one exists
if duck_predicate is not None:
relation_final = relation_final.filter(duck_predicate)
if batch_size is None:
results = relation_final.fetch_arrow_reader()
else:
results = relation_final.fetch_arrow_reader(batch_size)
results = relation_final.to_arrow_reader() if batch_size is None else relation_final.to_arrow_reader(batch_size)

for record_batch in iter(results.read_next_batch, None):
if predicate is not None and duck_predicate is None:
2 changes: 1 addition & 1 deletion external/duckdb
Submodule duckdb updated 293 files
2 changes: 1 addition & 1 deletion scripts/connection_methods.json
@@ -395,7 +395,7 @@
"return": "polars.DataFrame"
},
{
"name": "fetch_arrow_table",
"name": "arrow_table",
"function": "FetchArrow",
"docs": "Fetch a result as Arrow table following execute()",
"args": [
40 changes: 27 additions & 13 deletions src/duckdb_py/duckdb_python.cpp
@@ -446,31 +446,45 @@ static void InitializeConnectionMethods(py::module_ &m) {
"Fetch a result as Polars DataFrame following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
py::arg("lazy") = false, py::arg("connection") = py::none());
m.def(
"fetch_arrow_table",
[](idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn = nullptr) {
"to_arrow_table",
[](idx_t batch_size, shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
return conn->FetchArrow(rows_per_batch);
return conn->FetchArrow(batch_size);
},
"Fetch a result as Arrow table following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
"Fetch a result as Arrow table following execute()", py::arg("batch_size") = 1000000, py::kw_only(),
py::arg("connection") = py::none());
m.def(
"fetch_record_batch",
[](const idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn = nullptr) {
"to_arrow_reader",
[](idx_t batch_size, shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
return conn->FetchRecordBatchReader(rows_per_batch);
return conn->FetchRecordBatchReader(batch_size);
},
"Fetch an Arrow RecordBatchReader following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
"Fetch an Arrow RecordBatchReader following execute()", py::arg("batch_size") = 1000000, py::kw_only(),
py::arg("connection") = py::none());
m.def(
"arrow",
"fetch_arrow_table",
[](idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
PyErr_WarnEx(PyExc_DeprecationWarning, "fetch_arrow_table() is deprecated, use to_arrow_table() instead.",
0);
return conn->FetchArrow(rows_per_batch);
},
"Fetch a result as Arrow table following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
py::arg("connection") = py::none());
m.def(
"fetch_record_batch",
[](const idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
PyErr_WarnEx(PyExc_DeprecationWarning, "fetch_record_batch() is deprecated, use to_arrow_reader() instead.",
0);
return conn->FetchRecordBatchReader(rows_per_batch);
},
"Fetch an Arrow RecordBatchReader following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
@@ -957,14 +971,14 @@ static void InitializeConnectionMethods(py::module_ &m) {
// We define these "wrapper" methods manually because they are overloaded
m.def(
"arrow",
[](idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn) -> duckdb::pyarrow::Table {
[](idx_t rows_per_batch, shared_ptr<DuckDBPyConnection> conn) -> duckdb::pyarrow::RecordBatchReader {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
return conn->FetchArrow(rows_per_batch);
return conn->FetchRecordBatchReader(rows_per_batch);
},
"Fetch a result as Arrow table following execute()", py::arg("rows_per_batch") = 1000000, py::kw_only(),
py::arg("connection") = py::none());
"Alias of to_arrow_reader(). We recommend using to_arrow_reader() instead.",
py::arg("rows_per_batch") = 1000000, py::kw_only(), py::arg("connection") = py::none());
m.def(
"arrow",
[](py::object &arrow_object, shared_ptr<DuckDBPyConnection> conn) -> unique_ptr<DuckDBPyRelation> {
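Two behaviors fall out of the module-level bindings above: the legacy fetchers keep working but now emit a DeprecationWarning, and the no-object overload of `duckdb.arrow()` returns a RecordBatchReader rather than a Table. An illustrative check, not part of the diff:

```python
# Illustrative only, assuming the module builds as shown above: legacy module-level
# fetchers warn but still return results, and arrow() now yields a reader.
import warnings
import duckdb

duckdb.execute("SELECT 1 AS x")
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tbl = duckdb.fetch_arrow_table()          # still returns a Table
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

duckdb.execute("SELECT 1 AS x")
reader = duckdb.arrow()                       # RecordBatchReader, no longer a Table
```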
27 changes: 22 additions & 5 deletions src/duckdb_py/pyconnection.cpp
@@ -203,11 +203,28 @@ static void InitializeConnectionMethods(py::class_<DuckDBPyConnection, shared_pt
py::kw_only(), py::arg("date_as_object") = false);
m.def("pl", &DuckDBPyConnection::FetchPolars, "Fetch a result as Polars DataFrame following execute()",
py::arg("rows_per_batch") = 1000000, py::kw_only(), py::arg("lazy") = false);
m.def("fetch_arrow_table", &DuckDBPyConnection::FetchArrow, "Fetch a result as Arrow table following execute()",
py::arg("rows_per_batch") = 1000000);
m.def("fetch_record_batch", &DuckDBPyConnection::FetchRecordBatchReader,
"Fetch an Arrow RecordBatchReader following execute()", py::arg("rows_per_batch") = 1000000);
m.def("arrow", &DuckDBPyConnection::FetchRecordBatchReader, "Fetch an Arrow RecordBatchReader following execute()",
m.def("to_arrow_table", &DuckDBPyConnection::FetchArrow, "Fetch a result as Arrow table following execute()",
py::arg("batch_size") = 1000000);
m.def("to_arrow_reader", &DuckDBPyConnection::FetchRecordBatchReader,
"Fetch an Arrow RecordBatchReader following execute()", py::arg("batch_size") = 1000000);
m.def(
"fetch_arrow_table",
[](DuckDBPyConnection &self, idx_t rows_per_batch) {
PyErr_WarnEx(PyExc_DeprecationWarning, "fetch_arrow_table() is deprecated, use to_arrow_table() instead.",
0);
return self.FetchArrow(rows_per_batch);
},
"Fetch a result as Arrow table following execute()", py::arg("rows_per_batch") = 1000000);
m.def(
"fetch_record_batch",
[](DuckDBPyConnection &self, idx_t rows_per_batch) {
PyErr_WarnEx(PyExc_DeprecationWarning, "fetch_record_batch() is deprecated, use to_arrow_reader() instead.",
0);
return self.FetchRecordBatchReader(rows_per_batch);
},
"Fetch an Arrow RecordBatchReader following execute()", py::arg("rows_per_batch") = 1000000);
m.def("arrow", &DuckDBPyConnection::FetchRecordBatchReader,
"Alias of to_arrow_reader(). We recommend using to_arrow_reader() instead.",
py::arg("rows_per_batch") = 1000000);
m.def("torch", &DuckDBPyConnection::FetchPyTorch, "Fetch a result as dict of PyTorch Tensors following execute()");
m.def("tf", &DuckDBPyConnection::FetchTF, "Fetch a result as dict of TensorFlow Tensors following execute()");
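At the connection level the keyword name changes along with the method name: the new methods take `batch_size`, while the deprecated spellings keep `rows_per_batch`. A hedged sketch under the same assumptions:

```python
# Connection-level sketch of the bindings above; new spelling uses batch_size,
# deprecated spelling keeps rows_per_batch and warns.
import duckdb

con = duckdb.connect()

con.execute("SELECT * FROM range(10)")
reader = con.to_arrow_reader(batch_size=4)            # new spelling

con.execute("SELECT * FROM range(10)")
legacy = con.fetch_record_batch(rows_per_batch=4)     # works, but emits DeprecationWarning
```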
42 changes: 29 additions & 13 deletions src/duckdb_py/pyrelation/initialize.cpp
@@ -62,12 +62,21 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
py::arg("date_as_object") = false)
.def("fetch_df_chunk", &DuckDBPyRelation::FetchDFChunk, "Execute and fetch a chunk of the rows",
py::arg("vectors_per_chunk") = 1, py::kw_only(), py::arg("date_as_object") = false)
.def("arrow", &DuckDBPyRelation::ToRecordBatch,
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
.def("fetch_arrow_table", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
py::arg("batch_size") = 1000000)
.def("to_arrow_table", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
py::arg("batch_size") = 1000000)
.def("to_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
.def("arrow", &DuckDBPyRelation::ToRecordBatch,
"Alias of to_arrow_reader(). We recommend using to_arrow_reader() instead.",
py::arg("batch_size") = 1000000)
.def(
"fetch_arrow_table",
[](pybind11::object &self, idx_t batch_size) {
PyErr_WarnEx(PyExc_DeprecationWarning,
"fetch_arrow_table() is deprecated, use to_arrow_table() instead.", 0);
return self.attr("to_arrow_table")(batch_size);
},
"Execute and fetch all rows as an Arrow Table", py::arg("batch_size") = 1000000)
.def("pl", &DuckDBPyRelation::ToPolars, "Execute and fetch all rows as a Polars DataFrame",
py::arg("batch_size") = 1000000, py::kw_only(), py::arg("lazy") = false)
.def("torch", &DuckDBPyRelation::FetchPyTorch, "Fetch a result as dict of PyTorch Tensors")
@@ -79,18 +88,25 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
)";
m.def("__arrow_c_stream__", &DuckDBPyRelation::ToArrowCapsule, capsule_docs,
py::arg("requested_schema") = py::none());
m.def("fetch_record_batch", &DuckDBPyRelation::ToRecordBatch,
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000)
.def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
m.def(
"fetch_record_batch",
[](pybind11::object &self, idx_t rows_per_batch) {
PyErr_WarnEx(PyExc_DeprecationWarning,
"fetch_record_batch() is deprecated, use to_arrow_reader() instead.", 0);
return self.attr("to_arrow_reader")(rows_per_batch);
},
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000)
.def(
"record_batch",
[](pybind11::object &self, idx_t rows_per_batch) {
"fetch_arrow_reader",
[](pybind11::object &self, idx_t batch_size) {
PyErr_WarnEx(PyExc_DeprecationWarning,
"record_batch() is deprecated, use fetch_record_batch() instead.", 0);
return self.attr("fetch_record_batch")(rows_per_batch);
"fetch_arrow_reader() is deprecated, use to_arrow_reader() instead.", 0);
if (PyErr_Occurred()) {
throw py::error_already_set();
}
return self.attr("to_arrow_reader")(batch_size);
},
py::arg("batch_size") = 1000000);
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000);
}

static void InitializeAggregates(py::class_<DuckDBPyRelation> &m) {
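The relation-level surface mirrors the connection: `arrow()` stays an alias of `to_arrow_reader()`, and the `fetch_*` spellings forward to the new methods with a DeprecationWarning. A sketch under the same assumptions:

```python
# Relation-level sketch; a relation can be materialized repeatedly, so no
# re-execute is needed between calls.
import duckdb

rel = duckdb.sql("SELECT range AS i FROM range(6)")
table = rel.to_arrow_table(batch_size=3)      # pyarrow Table
reader = rel.to_arrow_reader(batch_size=3)    # pyarrow RecordBatchReader
legacy = rel.fetch_arrow_table()              # still a Table, but warns
```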
6 changes: 3 additions & 3 deletions tests/fast/api/test_dbapi_fetch.py
@@ -42,11 +42,11 @@ def test_multiple_fetch_arrow(self, duckdb_cursor):
pytest.importorskip("pyarrow")
con = duckdb.connect()
c = con.execute("SELECT 42::BIGINT AS a")
table = c.fetch_arrow_table()
table = c.to_arrow_table()
df = table.to_pandas()
pd.testing.assert_frame_equal(df, pd.DataFrame.from_dict({"a": [42]}))
assert c.fetch_arrow_table() is None
assert c.fetch_arrow_table() is None
assert c.to_arrow_table() is None
assert c.to_arrow_table() is None

def test_multiple_close(self, duckdb_cursor):
con = duckdb.connect()
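A possible companion test, not part of this diff, asserting the deprecation warnings introduced above; it assumes pytest's default warning filters so that DeprecationWarning is observable.

```python
# Hypothetical test sketch: the deprecated spellings should warn while still
# producing Arrow results.
import duckdb
import pytest


def test_arrow_fetch_deprecation_warnings():
    pytest.importorskip("pyarrow")
    con = duckdb.connect()
    con.execute("SELECT 1 AS a")
    with pytest.warns(DeprecationWarning):
        assert con.fetch_arrow_table() is not None
    con.execute("SELECT 1 AS a")
    with pytest.warns(DeprecationWarning):
        assert con.fetch_record_batch() is not None
```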