pgpq
Convert PyArrow RecordBatches to Postgres' native binary format.
Usage
"""Example for README.md"""
from tempfile import mkdtemp
import psycopg
import pyarrow.dataset as ds
import requests
from pgpq import ArrowToPostgresBinaryEncoder
tmpdir = mkdtemp()
with open(f"{tmpdir}/yellow_tripdata_2023-01.parquet", mode="wb") as f:
resp = requests.get(
"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
)
resp.raise_for_status()
f.write(resp.content)
dataset = ds.dataset(tmpdir)
encoder = ArrowToPostgresBinaryEncoder(dataset.schema)
pg_schema = encoder.schema()
cols = [f'"{col_name}" {col.data_type.ddl()}' for col_name, col in pg_schema.columns]
ddl = f"CREATE TEMP TABLE data ({','.join(cols)})"
with psycopg.connect("postgres://postgres:postgres@localhost:5432/postgres") as conn:
with conn.cursor() as cursor:
cursor.execute(ddl)
with cursor.copy("COPY data FROM STDIN WITH (FORMAT BINARY)") as copy:
copy.write(encoder.write_header())
for batch in dataset.to_batches():
copy.write(encoder.write_batch(batch))
copy.write(encoder.finish())