nodejs-polars - npm Package Compare versions

Comparing version 0.2.1 to 0.3.0


bin/dataframe.d.ts

@@ -300,2 +300,24 @@ /// <reference types="node" />

/**
*
*
* __Extend the memory backed by this `DataFrame` with the values from `other`.__
* ___
* Different from `vstack` which adds the chunks from `other` to the chunks of this `DataFrame`,
* `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
* If this does not cause a reallocation, the resulting data structure will not have any extra chunks
* and thus will yield faster queries.
* Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
* online operations where you add `n` rows and rerun a query.
* Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
* when you read in multiple files and want to store them in a single `DataFrame`.
* In the latter case, finish the sequence of `vstack` operations with a `rechunk`.
* @param other DataFrame to vertically add.
*/
extend(other: DataFrame): DataFrame;
/**
* Fill null/missing values by a filling strategy

@@ -302,0 +324,0 @@ *
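For orientation, a minimal sketch of how the two append paths differ from the caller's side (assuming the usual default import; the frames and column names are made up):

```
import pl from "nodejs-polars";

const df = pl.DataFrame({ a: [1, 2], b: ["x", "y"] });
const other = pl.DataFrame({ a: [3], b: ["z"] });

// Single append followed by a query: extend keeps the data in one chunk.
const extended = df.extend(other);

// Many appends before querying: vstack repeatedly, then rechunk once at the end.
const stacked = df.vstack(other).vstack(other).rechunk();
```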


bin/dataframe.js

@@ -180,2 +180,5 @@ "use strict";

},
extend(other) {
return wrap("extend", { other: other._df });
},
filter(predicate) {

@@ -182,0 +185,0 @@ return this


bin/index.d.ts

@@ -5,3 +5,3 @@ import * as series from "./series/series";

import * as func from "./functions";
import io from "./io";
import * as io from "./io";
import * as cfg from "./cfg";

@@ -8,0 +8,0 @@ import type { FillNullStrategy as _FillNullStrategy } from "./utils";

@@ -21,5 +21,2 @@ "use strict";

};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
const series = __importStar(require("./series/series"));

@@ -29,3 +26,3 @@ const df = __importStar(require("./dataframe"));

const func = __importStar(require("./functions"));
const io_1 = __importDefault(require("./io"));
const io = __importStar(require("./io"));
const cfg = __importStar(require("./cfg"));

@@ -61,11 +58,11 @@ const package_json_1 = require("../package.json");

// IO
pl.scanCSV = io_1.default.scanCSV;
pl.scanIPC = io_1.default.scanIPC;
pl.scanParquet = io_1.default.scanParquet;
pl.readCSV = io_1.default.readCSV;
pl.readIPC = io_1.default.readIPC;
pl.readJSON = io_1.default.readJSON;
pl.readParquet = io_1.default.readParquet;
pl.readCSVStream = io_1.default.readCSVStream;
pl.readJSONStream = io_1.default.readJSONStream;
pl.scanCSV = io.scanCSV;
pl.scanIPC = io.scanIPC;
pl.scanParquet = io.scanParquet;
pl.readCSV = io.readCSV;
pl.readIPC = io.readIPC;
pl.readJSON = io.readJSON;
pl.readParquet = io.readParquet;
pl.readCSVStream = io.readCSVStream;
pl.readJSONStream = io.readJSONStream;
// lazy

@@ -72,0 +69,0 @@ pl.col = lazy_1.funcs.col;

@@ -16,3 +16,8 @@ /// <reference types="node" />

rechunk?: boolean;
rowCount?: RowCount;
};
declare type RowCount = {
name: string;
offset?: number;
};
declare type ReadCsvOptions = {

@@ -36,2 +41,3 @@ batchSize?: number;

startRows?: number;
rowCount?: RowCount;
};
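The new `rowCount` option appears alongside the CSV, JSON and IPC read options; a hedged sketch of passing it to `readCSV`, assuming it adds a counter column described by `name` and `offset` (the file path and column name are placeholders):

```
import pl from "nodejs-polars";

// Assumption: rowCount adds a counter column named "idx", starting at 10.
const df = pl.readCSV("data.csv", {
  rowCount: { name: "idx", offset: 10 },
});
```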

@@ -48,2 +54,3 @@ declare type ReadJsonOptions = {

rechunk?: boolean;
rowCount?: RowCount;
};

@@ -54,277 +61,247 @@ declare type ReadIPCOptions = {

numRows?: number;
rowCount?: RowCount;
};
declare namespace io {
/**
* __Read a CSV file or string into a Dataframe.__
* ___
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.csv`.
* - body: String or buffer to be read as a CSV
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.hasHeader - Indicate if first row of dataset is header or not. If set to False first row will be set to `column_x`,
* `x` being an enumeration over every column in the dataset.
* @param options.ignoreErrors -Try to keep reading lines if some lines yield errors.
* @param options.endRows -After n rows are read from the CSV, it stops reading.
* During multi-threaded parsing, an upper bound of `n` rows
* cannot be guaranteed.
* @param options.startRows -Start reading after `startRows` position.
* @param options.projection -Indices of columns to select. Note that column indices start at zero.
* @param options.sep -Character to use as delimiter in the file.
* @param options.columns -Columns to select.
* @param options.rechunk -Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.encoding -Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with `�` character.
* @param options.numThreads -Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system.
* @param options.dtype -Overwrite the dtypes during inference.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* @param options.commentChar - character that indicates the start of a comment line, for instance '#'.
* @param options.quotChar -character that is used for csv quoting, default = ''. Set to null to turn special handling and escaping of quotes off.
* @param options.nullValues - Values to interpret as null values. You can provide a
* - `string` -> all values encountered equal to this string will be null
* - `Array<string>` -> A null value per column.
* - `Record<string,string>` -> An object or map that maps column name to a null value string.Ex. {"column_1": 0}
* @param options.parseDates -Whether to attempt to parse dates or not
* @returns DataFrame
*/
interface readCSV {
(pathOrBody: string | Buffer, options?: Partial<ReadCsvOptions>): DataFrame;
}
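A short usage sketch of `readCSV` with a handful of the options documented above (path and values are illustrative):

```
import pl from "nodejs-polars";

// Read at most 1000 rows, treat "NA" as null, and try to parse date-like columns.
const df = pl.readCSV("sales.csv", {
  sep: ",",
  hasHeader: true,
  endRows: 1000,
  nullValues: "NA",
  parseDates: true,
});
```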
/**
* __Lazily read from a CSV file or multiple files via glob patterns.__
*
* This allows the query optimizer to push down predicates and
* projections to the scan level, thereby potentially reducing
* memory overhead.
* ___
* @param path path to a file
* @param options.hasHeader - Indicate if first row of dataset is header or not. If set to False first row will be set to `column_x`,
* `x` being an enumeration over every column in the dataset.
* @param options.sep -Character to use as delimiter in the file.
* @param options.commentChar - character that indicates the start of a comment line, for instance '#'.
* @param options.quotChar -character that is used for csv quoting, default = ''. Set to null to turn special handling and escaping of quotes off.
* @param options.startRows -Start reading after `startRows` position.
* @param options.nullValues - Values to interpret as null values. You can provide a
* - `string` -> all values encountered equal to this string will be null
* - `Array<string>` -> A null value per column.
* - `Record<string,string>` -> An object or map that maps column name to a null value string.Ex. {"column_1": 0}
* @param options.ignoreErrors -Try to keep reading lines if some lines yield errors.
* @param options.cache Cache the result after reading.
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.endRows -After n rows are read from the CSV, it stops reading.
* During multi-threaded parsing, an upper bound of `n` rows
* cannot be guaranteed.
* @param options.rechunk -Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* ___
*
*/
interface scanCSV {
(path: string, options?: Partial<ReadCsvOptions>): LazyDataFrame;
}
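A minimal lazy-scan sketch; the predicate is illustrative and `collectSync` is assumed to be the synchronous materialization call on `LazyDataFrame`:

```
import pl from "nodejs-polars";

// Predicates and projections applied to the lazy frame can be pushed down to the scan.
const df = pl
  .scanCSV("events.csv", { hasHeader: true })
  .filter(pl.col("amount").gt(100))
  .select(pl.col("user_id"), pl.col("amount"))
  .collectSync(); // assumed synchronous collect on LazyDataFrame
```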
/**
* __Read a JSON file or string into a DataFrame.__
*
* _Note: Currently only newline delimited JSON is supported_
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.csv`.
* - body: String or buffer to be read as a CSV
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @returns ({@link DataFrame})
* @example
* ```
* const jsonString = `
* {"a", 1, "b", "foo", "c": 3}
* {"a": 2, "b": "bar", "c": 6}
* `
* > const df = pl.readJSON(jsonString)
* > console.log(df)
* shape: (2, 3)
* ╭─────┬─────┬─────╮
* │ a ┆ b ┆ c │
* │ --- ┆ --- ┆ --- │
* │ i64 ┆ str ┆ i64 │
* ╞═════╪═════╪═════╡
* │ 1 ┆ foo ┆ 3 │
* ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ bar ┆ 6 │
* ╰─────┴─────┴─────╯
* ```
*/
interface readJSON {
(pathOrBody: string | Buffer, options?: Partial<ReadJsonOptions>): DataFrame;
}
/**
* Read into a DataFrame from a parquet file.
* @param pathOrBuffer
* Path to a file, list of files, or a file like object. If the path is a directory, that directory will be used
* as partition aware scan.
* @param options.columns Columns to select. Accepts a list of column names.
* @param options.numRows Stop reading from parquet file after reading ``n_rows``.
* @param options.parallel Read the parquet file in parallel. The single threaded reader consumes less memory.
*/
interface readParquet {
(pathOrBody: string | Buffer, options?: ReadParquetOptions): DataFrame;
}
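A brief `readParquet` sketch using the two documented options (the path is a placeholder):

```
import pl from "nodejs-polars";

// Keep two columns and stop after 500 rows.
const df = pl.readParquet("metrics.parquet", {
  columns: ["timestamp", "value"],
  numRows: 500,
});
```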
/**
* __Lazily read from a parquet file or multiple files via glob patterns.__
* ___
* This allows the query optimizer to push down predicates and projections to the scan level,
* thereby potentially reducing memory overhead.
* @param path Path to a file or glob pattern
* @param options.numRows Stop reading from parquet file after reading ``n_rows``.
* @param options.cache Cache the result after reading.
* @param options.parallel Read the parquet file in parallel. The single threaded reader consumes less memory.
* @param options.rechunk In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.
*/
interface scanParquet {
(path: string, options?: ScanParquetOptions): LazyDataFrame;
}
/**
* __Read into a DataFrame from Arrow IPC (Feather v2) file.__
* ___
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.ipc`.
* - body: String or buffer to be read as Arrow IPC
* @param options.columns Columns to select. Accepts a list of column names.
* @param options.numRows Stop reading from parquet file after reading ``n_rows``.
*/
interface readIPC {
(pathOrBody: string | Buffer, options?: ReadIPCOptions): DataFrame;
}
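And the eager IPC counterpart, again with placeholder path and values:

```
import pl from "nodejs-polars";

// Read an Arrow IPC (Feather v2) file, keeping one column and at most 100 rows.
const df = pl.readIPC("table.ipc", { columns: ["id"], numRows: 100 });
```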
/**
* __Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.__
* ___
* @param path Path to an IPC file.
* @param options.numRows Stop reading from IPC file after reading ``numRows``
* @param options.cache Cache the result after reading.
* @param options.rechunk Reallocate to contiguous memory when all chunks/ files are parsed.
*/
interface scanIPC {
(path: string, options?: ScanIPCOptions): LazyDataFrame;
}
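A hedged lazy IPC scan, mirroring `scanParquet` above; `collectSync` is an assumption, as before:

```
import pl from "nodejs-polars";

const df = pl
  .scanIPC("table.ipc", { cache: true, rechunk: true })
  .select(pl.col("id"), pl.col("value"))
  .collectSync(); // assumed synchronous collect on LazyDataFrame
```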
/**
* __Read a stream into a Dataframe.__
*
* **Warning:** this is much slower than `scanCSV` or `readCSV`
*
* This will consume the entire stream into a single buffer and then call `readCSV`
* Only use it when you must consume from a stream, or when performance is not a major consideration
*
* ___
* @param stream - readable stream containing csv data
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.hasHeader - Indicate if first row of dataset is header or not. If set to False first row will be set to `column_x`,
* `x` being an enumeration over every column in the dataset.
* @param options.ignoreErrors -Try to keep reading lines if some lines yield errors.
* @param options.endRows -After n rows are read from the CSV, it stops reading.
* During multi-threaded parsing, an upper bound of `n` rows
* cannot be guaranteed.
* @param options.startRows -Start reading after `startRows` position.
* @param options.projection -Indices of columns to select. Note that column indices start at zero.
* @param options.sep -Character to use as delimiter in the file.
* @param options.columns -Columns to select.
* @param options.rechunk -Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.encoding -Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with `�` character.
* @param options.numThreads -Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system.
* @param options.dtype -Overwrite the dtypes during inference.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* @param options.commentChar - character that indicates the start of a comment line, for instance '#'.
* @param options.quotChar -character that is used for csv quoting, default = ''. Set to null to turn special handling and escaping of quotes off.
* @param options.nullValues - Values to interpret as null values. You can provide a
* - `string` -> all values encountered equal to this string will be null
* - `Array<string>` -> A null value per column.
* - `Record<string,string>` -> An object or map that maps column name to a null value string.Ex. {"column_1": 0}
* @param options.parseDates -Whether to attempt to parse dates or not
* @returns Promise<DataFrame>
*
* @example
* ```
* >>> const readStream = new Stream.Readable({read(){}});
* >>> readStream.push(`a,b\n`);
* >>> readStream.push(`1,2\n`);
* >>> readStream.push(`2,2\n`);
* >>> readStream.push(`3,2\n`);
* >>> readStream.push(`4,2\n`);
* >>> readStream.push(null);
*
* >>> pl.readCSVStream(readStream).then(df => console.log(df));
* shape: (4, 2)
* ┌─────┬─────┐
* │ a ┆ b │
* │ --- ┆ --- │
* │ i64 ┆ i64 │
* ╞═════╪═════╡
* │ 1 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 3 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 4 ┆ 2 │
* └─────┴─────┘
* ```
*/
interface readCSVStream {
(stream: Readable, options?: ReadCsvOptions): Promise<DataFrame>;
}
/**
* __Read a newline delimited JSON stream into a DataFrame.__
*
* @param stream - readable stream containing json data
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* Note: this is done per batch
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @example
* ```
* >>> const readStream = new Stream.Readable({read(){}});
* >>> readStream.push(`${JSON.stringify({a: 1, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 2, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 3, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 4, b: 2})} \n`);
* >>> readStream.push(null);
*
* >>> pl.readJSONStream(readStream).then(df => console.log(df));
* shape: (4, 2)
* ┌─────┬─────┐
* │ a ┆ b │
* │ --- ┆ --- │
* │ i64 ┆ i64 │
* ╞═════╪═════╡
* │ 1 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 3 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 4 ┆ 2 │
* └─────┴─────┘
* ```
*/
interface readJSONStream {
(stream: Readable, options?: ReadJsonOptions): Promise<DataFrame>;
}
}
declare namespace io {
function readCSV(pathOrBody: any, options?: any): DataFrame;
function scanCSV(path: any, options?: any): LazyDataFrame;
function readJSON(pathOrBody: any, options?: any): DataFrame;
function readParquet(pathOrBody: any, options?: any): DataFrame;
function scanParquet(path: any, options?: any): LazyDataFrame;
function readIPC(pathOrBody: any, options?: any): DataFrame;
function scanIPC(path: any, options?: any): LazyDataFrame;
function readCSVStream(stream: any, options?: any): Promise<unknown>;
function readJSONStream(stream: any, options?: any): Promise<unknown>;
}
export = io;
/**
* __Read a CSV file or string into a Dataframe.__
* ___
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.csv`.
* - body: String or buffer to be read as a CSV
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.hasHeader - Indicate if first row of dataset is header or not. If set to False first row will be set to `column_x`,
* `x` being an enumeration over every column in the dataset.
* @param options.ignoreErrors -Try to keep reading lines if some lines yield errors.
* @param options.endRows -After n rows are read from the CSV, it stops reading.
* During multi-threaded parsing, an upper bound of `n` rows
* cannot be guaranteed.
* @param options.startRows -Start reading after `startRows` position.
* @param options.projection -Indices of columns to select. Note that column indices start at zero.
* @param options.sep -Character to use as delimiter in the file.
* @param options.columns -Columns to select.
* @param options.rechunk -Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.encoding -Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with `�` character.
* @param options.numThreads -Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system.
* @param options.dtype -Overwrite the dtypes during inference.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* @param options.commentChar - character that indicates the start of a comment line, for instance '#'.
* @param options.quotChar -character that is used for csv quoting, default = ''. Set to null to turn special handling and escaping of quotes off.
* @param options.nullValues - Values to interpret as null values. You can provide a
* - `string` -> all values encountered equal to this string will be null
* - `Array<string>` -> A null value per column.
* - `Record<string,string>` -> An object or map that maps column name to a null value string.Ex. {"column_1": 0}
* @param options.parseDates -Whether to attempt to parse dates or not
* @returns DataFrame
*/
export declare function readCSV(pathOrBody: string | Buffer, options?: Partial<ReadCsvOptions>): DataFrame;
/**
* __Lazily read from a CSV file or multiple files via glob patterns.__
*
* This allows the query optimizer to push down predicates and
* projections to the scan level, thereby potentially reducing
* memory overhead.
* ___
* @param path path to a file
* @param options.hasHeader - Indicate if first row of dataset is header or not. If set to False first row will be set to `column_x`,
* `x` being an enumeration over every column in the dataset.
* @param options.sep -Character to use as delimiter in the file.
* @param options.commentChar - character that indicates the start of a comment line, for instance '#'.
* @param options.quotChar -character that is used for csv quoting, default = ''. Set to null to turn special handling and escaping of quotes off.
* @param options.startRows -Start reading after `startRows` position.
* @param options.nullValues - Values to interpret as null values. You can provide a
* - `string` -> all values encountered equal to this string will be null
* - `Array<string>` -> A null value per column.
* - `Record<string,string>` -> An object or map that maps column name to a null value string.Ex. {"column_1": 0}
* @param options.ignoreErrors -Try to keep reading lines if some lines yield errors.
* @param options.cache Cache the result after reading.
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.endRows -After n rows are read from the CSV, it stops reading.
* During multi-threaded parsing, an upper bound of `n` rows
* cannot be guaranteed.
* @param options.rechunk -Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* ___
*
*/
export declare function scanCSV(path: string, options?: Partial<ReadCsvOptions>): LazyDataFrame;
/**
* __Read a JSON file or string into a DataFrame.__
*
* _Note: Currently only newline delimited JSON is supported_
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.csv`.
* - body: String or buffer to be read as a CSV
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @returns ({@link DataFrame})
* @example
* ```
* const jsonString = `
* {"a", 1, "b", "foo", "c": 3}
* {"a": 2, "b": "bar", "c": 6}
* `
* > const df = pl.readJSON(jsonString)
* > console.log(df)
* shape: (2, 3)
* ╭─────┬─────┬─────╮
* │ a ┆ b ┆ c │
* │ --- ┆ --- ┆ --- │
* │ i64 ┆ str ┆ i64 │
* ╞═════╪═════╪═════╡
* │ 1 ┆ foo ┆ 3 │
* ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ bar ┆ 6 │
* ╰─────┴─────┴─────╯
* ```
*/
export declare function readJSON(pathOrBody: string | Buffer, options?: Partial<ReadJsonOptions>): DataFrame;
/**
* Read into a DataFrame from a parquet file.
* @param pathOrBuffer
* Path to a file, list of files, or a file like object. If the path is a directory, that directory will be used
* as partition aware scan.
* @param options.columns Columns to select. Accepts a list of column names.
* @param options.numRows Stop reading from parquet file after reading ``n_rows``.
* @param options.parallel Read the parquet file in parallel. The single threaded reader consumes less memory.
*/
export declare function readParquet(pathOrBody: string | Buffer, options?: ReadParquetOptions): DataFrame;
/**
* __Lazily read from a parquet file or multiple files via glob patterns.__
* ___
* This allows the query optimizer to push down predicates and projections to the scan level,
* thereby potentially reducing memory overhead.
* @param path Path to a file or glob pattern
* @param options.numRows Stop reading from parquet file after reading ``n_rows``.
* @param options.cache Cache the result after reading.
* @param options.parallel Read the parquet file in parallel. The single threaded reader consumes less memory.
* @param options.rechunk In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.
*/
export declare function scanParquet(path: string, options?: ScanParquetOptions): LazyDataFrame;
/**
* __Read into a DataFrame from Arrow IPC (Feather v2) file.__
* ___
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.ipc`.
* - body: String or buffer to be read as Arrow IPC
* @param options.columns Columns to select. Accepts a list of column names.
* @param options.numRows Stop reading from parquet file after reading ``n_rows``.
*/
export declare function readIPC(pathOrBody: string | Buffer, options?: ReadIPCOptions): DataFrame;
/**
* __Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.__
* ___
* @param path Path to an IPC file.
* @param options.numRows Stop reading from IPC file after reading ``numRows``
* @param options.cache Cache the result after reading.
* @param options.rechunk Reallocate to contiguous memory when all chunks/ files are parsed.
*/
export declare function scanIPC(path: string, options?: ScanIPCOptions): LazyDataFrame;
/**
* __Read a stream into a Dataframe.__
*
* **Warning:** this is much slower than `scanCSV` or `readCSV`
*
* This will consume the entire stream into a single buffer and then call `readCSV`
* Only use it when you must consume from a stream, or when performance is not a major consideration
*
* ___
* @param stream - readable stream containing csv data
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.hasHeader - Indicate if first row of dataset is header or not. If set to False first row will be set to `column_x`,
* `x` being an enumeration over every column in the dataset.
* @param options.ignoreErrors -Try to keep reading lines if some lines yield errors.
* @param options.endRows -After n rows are read from the CSV, it stops reading.
* During multi-threaded parsing, an upper bound of `n` rows
* cannot be guaranteed.
* @param options.startRows -Start reading after `startRows` position.
* @param options.projection -Indices of columns to select. Note that column indices start at zero.
* @param options.sep -Character to use as delimiter in the file.
* @param options.columns -Columns to select.
* @param options.rechunk -Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.encoding -Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with `�` character.
* @param options.numThreads -Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system.
* @param options.dtype -Overwrite the dtypes during inference.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* @param options.commentChar - character that indicates the start of a comment line, for instance '#'.
* @param options.quotChar -character that is used for csv quoting, default = ''. Set to null to turn special handling and escaping of quotes off.
* @param options.nullValues - Values to interpret as null values. You can provide a
* - `string` -> all values encountered equal to this string will be null
* - `Array<string>` -> A null value per column.
* - `Record<string,string>` -> An object or map that maps column name to a null value string.Ex. {"column_1": 0}
* @param options.parseDates -Whether to attempt to parse dates or not
* @returns Promise<DataFrame>
*
* @example
* ```
* >>> const readStream = new Stream.Readable({read(){}});
* >>> readStream.push(`a,b\n`);
* >>> readStream.push(`1,2\n`);
* >>> readStream.push(`2,2\n`);
* >>> readStream.push(`3,2\n`);
* >>> readStream.push(`4,2\n`);
* >>> readStream.push(null);
*
* >>> pl.readCSVStream(readStream).then(df => console.log(df));
* shape: (4, 2)
* ┌─────┬─────┐
* │ a ┆ b │
* │ --- ┆ --- │
* │ i64 ┆ i64 │
* ╞═════╪═════╡
* │ 1 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 3 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 4 ┆ 2 │
* └─────┴─────┘
* ```
*/
export declare function readCSVStream(stream: Readable, options?: ReadCsvOptions): Promise<DataFrame>;
/**
* __Read a newline delimited JSON stream into a DataFrame.__
*
* @param stream - readable stream containing json data
* @param options
* @param options.inferSchemaLength -Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* Note: this is done per batch
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @example
* ```
* >>> const readStream = new Stream.Readable({read(){}});
* >>> readStream.push(`${JSON.stringify({a: 1, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 2, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 3, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 4, b: 2})} \n`);
* >>> readStream.push(null);
*
* >>> pl.readJSONStream(readStream).then(df => console.log(df));
* shape: (4, 2)
* ┌─────┬─────┐
* │ a ┆ b │
* │ --- ┆ --- │
* │ i64 ┆ i64 │
* ╞═════╪═════╡
* │ 1 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 3 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 4 ┆ 2 │
* └─────┴─────┘
* ```
*/
export declare function readJSONStream(stream: Readable, options?: ReadJsonOptions): Promise<DataFrame>;
export {};

@@ -17,2 +17,4 @@ "use strict";

var _LineBatcher_lines, _LineBatcher_accumulatedLines, _LineBatcher_batchSize;
Object.defineProperty(exports, "__esModule", { value: true });
exports.readJSONStream = exports.readCSVStream = exports.scanIPC = exports.readIPC = exports.scanParquet = exports.readParquet = exports.readJSON = exports.scanCSV = exports.readCSV = void 0;
const polars_internal_1 = __importDefault(require("./internals/polars_internal"));

@@ -40,149 +42,2 @@ const dataframe_1 = require("./dataframe");

};
// Implementation
var io;
(function (io) {
function readCSV(pathOrBody, options) {
const extensions = [".tsv", ".csv"];
if (Buffer.isBuffer(pathOrBody)) {
return readCSVBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, extensions);
if (inline) {
return readCSVBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readCSVPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
io.readCSV = readCSV;
function scanCSV(path, options) {
options = { ...readCsvDefaultOptions, ...options };
return (0, dataframe_2.LazyDataFrame)(polars_internal_1.default.ldf.scanCSV({ path, ...options }));
}
io.scanCSV = scanCSV;
function readJSON(pathOrBody, options) {
const extensions = [".ndjson", ".json", ".jsonl"];
if (Buffer.isBuffer(pathOrBody)) {
return readJSONBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, extensions);
if (inline) {
return readJSONBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readJSONPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
io.readJSON = readJSON;
function readParquet(pathOrBody, options) {
if (Buffer.isBuffer(pathOrBody)) {
return readParquetBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".parquet"]);
if (inline) {
return readParquetBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readParquetPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
io.readParquet = readParquet;
function scanParquet(path, options) {
return (0, dataframe_2.LazyDataFrame)(polars_internal_1.default.ldf.scanParquet({ path, ...options }));
}
io.scanParquet = scanParquet;
function readIPC(pathOrBody, options) {
if (Buffer.isBuffer(pathOrBody)) {
return readIPCBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".ipc"]);
if (inline) {
return readIPCBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readIPCPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
io.readIPC = readIPC;
function scanIPC(path, options) {
return (0, dataframe_2.LazyDataFrame)(polars_internal_1.default.ldf.scanIPC({ path, ...options }));
}
io.scanIPC = scanIPC;
function readCSVStream(stream, options) {
let batchSize = options?.batchSize ?? 10000;
let count = 0;
let end = options?.endRows ?? Number.POSITIVE_INFINITY;
return new Promise((resolve, reject) => {
const s = stream.pipe(new LineBatcher({ batchSize }));
const chunks = [];
s.on("data", (chunk) => {
// early abort if 'end rows' is specified
if (count <= end) {
chunks.push(chunk);
}
else {
s.end();
}
count += batchSize;
}).on("end", () => {
try {
let buff = Buffer.concat(chunks);
const df = readCSVBuffer(buff, options);
resolve(df);
}
catch (err) {
reject(err);
}
});
});
}
io.readCSVStream = readCSVStream;
function readJSONStream(stream, options) {
let batchSize = options?.batchSize ?? 10000;
return new Promise((resolve, reject) => {
const chunks = [];
stream
.pipe(new LineBatcher({ batchSize }))
.on("data", (chunk) => {
try {
const df = readJSONBuffer(chunk, options);
chunks.push(df);
}
catch (err) {
reject(err);
}
})
.on("end", () => {
try {
const df = (0, functions_1.concat)(chunks);
resolve(df);
}
catch (err) {
reject(err);
}
});
});
}
io.readJSONStream = readJSONStream;
})(io || (io = {}));
// utility to read streams as lines.

@@ -252,2 +107,144 @@ class LineBatcher extends stream_1.Stream.Transform {

}
module.exports = io;
function readCSV(pathOrBody, options) {
const extensions = [".tsv", ".csv"];
if (Buffer.isBuffer(pathOrBody)) {
return readCSVBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, extensions);
if (inline) {
return readCSVBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readCSVPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
exports.readCSV = readCSV;
function scanCSV(path, options) {
options = { ...readCsvDefaultOptions, ...options };
return (0, dataframe_2.LazyDataFrame)(polars_internal_1.default.ldf.scanCSV({ path, ...options }));
}
exports.scanCSV = scanCSV;
function readJSON(pathOrBody, options) {
const extensions = [".ndjson", ".json", ".jsonl"];
if (Buffer.isBuffer(pathOrBody)) {
return readJSONBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, extensions);
if (inline) {
return readJSONBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readJSONPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
exports.readJSON = readJSON;
function readParquet(pathOrBody, options) {
if (Buffer.isBuffer(pathOrBody)) {
return readParquetBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".parquet"]);
if (inline) {
return readParquetBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readParquetPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
exports.readParquet = readParquet;
function scanParquet(path, options) {
return (0, dataframe_2.LazyDataFrame)(polars_internal_1.default.ldf.scanParquet({ path, ...options }));
}
exports.scanParquet = scanParquet;
function readIPC(pathOrBody, options) {
if (Buffer.isBuffer(pathOrBody)) {
return readIPCBuffer(pathOrBody, options);
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".ipc"]);
if (inline) {
return readIPCBuffer(Buffer.from(pathOrBody, "utf-8"), options);
}
else {
return readIPCPath(pathOrBody, options);
}
}
else {
throw new Error("must supply either a path or body");
}
}
exports.readIPC = readIPC;
function scanIPC(path, options) {
return (0, dataframe_2.LazyDataFrame)(polars_internal_1.default.ldf.scanIPC({ path, ...options }));
}
exports.scanIPC = scanIPC;
function readCSVStream(stream, options) {
let batchSize = options?.batchSize ?? 10000;
let count = 0;
let end = options?.endRows ?? Number.POSITIVE_INFINITY;
return new Promise((resolve, reject) => {
const s = stream.pipe(new LineBatcher({ batchSize }));
const chunks = [];
s.on("data", (chunk) => {
// early abort if 'end rows' is specified
if (count <= end) {
chunks.push(chunk);
}
else {
s.end();
}
count += batchSize;
}).on("end", () => {
try {
let buff = Buffer.concat(chunks);
const df = readCSVBuffer(buff, options);
resolve(df);
}
catch (err) {
reject(err);
}
});
});
}
exports.readCSVStream = readCSVStream;
function readJSONStream(stream, options) {
let batchSize = options?.batchSize ?? 10000;
return new Promise((resolve, reject) => {
const chunks = [];
stream
.pipe(new LineBatcher({ batchSize }))
.on("data", (chunk) => {
try {
const df = readJSONBuffer(chunk, options);
chunks.push(df);
}
catch (err) {
reject(err);
}
})
.on("end", () => {
try {
const df = (0, functions_1.concat)(chunks);
resolve(df);
}
catch (err) {
reject(err);
}
});
});
}
exports.readJSONStream = readJSONStream;

@@ -154,2 +154,4 @@ import { DataType } from "../datatypes";

* @param n The number of values to extend.
* @deprecated
* @see {@link extendConstant}
*/

@@ -161,2 +163,12 @@ extend(value: any, n: number): Expr;

}): Expr;
/**
* Extend the Series with given number of values.
* @param value The value to extend the Series with. This value may be null to fill with nulls.
* @param n The number of values to extend.
*/
extendConstant(value: any, n: number): Expr;
extendConstant(opt: {
value: any;
n: number;
}): Expr;
/** Fill nan value with a fill value */

@@ -163,0 +175,0 @@ fillNan(other: any): Expr;

@@ -127,6 +127,12 @@ "use strict";

if (n !== null && typeof n === "number") {
return wrap("extend", { value: o, n });
return wrap("extendConstant", { value: o, n });
}
return wrap("extend", o);
return wrap("extendConstant", o);
},
extendConstant(o, n) {
if (n !== null && typeof n === "number") {
return wrap("extendConstant", { value: o, n });
}
return wrap("extendConstant", o);
},
fillNan: wrapExprArg("fillNan", true),

@@ -133,0 +139,0 @@ fillNull(fillValue) {
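On the expression side, `extendConstant` supersedes the deprecated `extend`; a minimal sketch that only builds the expression (column name and values are illustrative):

```
import pl from "nodejs-polars";

// extendConstant replaces the deprecated Expr.extend; null pads with nulls.
const padA = pl.col("a").extendConstant(null, 2);

// The expression can then be used wherever expressions are accepted, e.g. df.select(padA).
```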

@@ -15,2 +15,9 @@ import { Expr } from "../expr";

first(): Expr;
/**
* Join all string items in a sublist and place a separator between them.
* This errors if inner type of list `!= Utf8`.
* @param separator A string used to separate one element of the list from the next in the resulting string.
* If omitted, the list elements are separated with a comma.
*/
join(separator?: string): Expr;
/** Get the last value of the sublists. */

@@ -17,0 +24,0 @@ last(): Expr;

@@ -20,2 +20,5 @@ "use strict";

},
join(separator = ",") {
return wrap("join", { separator });
},
last() {

@@ -22,0 +25,0 @@ return wrap("get", { index: -1 });
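A hedged sketch of the new list `join`; the `.lst` accessor name is an assumption, since the diff only shows the namespace file, and the column name is made up:

```
import pl from "nodejs-polars";

// Assumption: the list namespace is exposed as `.lst` on expressions.
const joined = pl.col("words").lst.join(" ");

// Usable in a select over a DataFrame whose "words" column is a list of strings.
```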

@@ -162,6 +162,14 @@ import { DataType } from "../../datatypes";

/**
* Parse a Series of dtype Utf8 to a Date/Datetime Series.
* @param datatype Date or Datetime.
* @param fmt formatting syntax. [Read more](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html)
*/
* Split a string into substrings using the specified separator and return them as a Series.
* @param separator — A string that identifies character or characters to use in separating the string.
* @param inclusive Include the split character/string in the results
*/
split(by: string, options?: {
inclusive?: boolean;
} | boolean): Expr;
/**
* Parse a Series of dtype Utf8 to a Date/Datetime Series.
* @param datatype Date or Datetime.
* @param fmt formatting syntax. [Read more](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html)
*/
strftime(datatype: DataType.Date, fmt?: string): Expr;

@@ -168,0 +176,0 @@ strftime(datatype: DataType.Datetime, fmt?: string): Expr;

@@ -72,2 +72,6 @@ "use strict";

},
split(by, options) {
const inclusive = typeof options === "boolean" ? options : options?.inclusive;
return wrap("split", { by, inclusive });
},
strftime(dtype, fmt) {

@@ -74,0 +78,0 @@ if (dtype === datatypes_1.DataType.Date) {
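For the new string `split` expression, a small sketch; the `.str` accessor is taken from the `Series` implementation later in this diff, and the data is illustrative:

```
import pl from "nodejs-polars";

const df = pl.DataFrame({ csvish: ["a,b,c", "x,y"] });

// Each row becomes a list of substrings; { inclusive: true } (or a bare boolean)
// keeps the separator in the parts.
const parts = df.select(pl.col("csvish").str.split(","));
```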

{
"name": "nodejs-polars",
"version": "0.2.1",
"version": "0.3.0",
"repository": "https://github.com/pola-rs/polars.git",

@@ -5,0 +5,0 @@ "license": "SEE LICENSE IN LICENSE",

@@ -5,2 +5,9 @@ import { JsSeries, Series } from "./series";

first(): Series<T>;
/**
* Join all string items in a sublist and place a separator between them.
* This errors if inner type of list `!= Utf8`.
* @param separator A string used to separate one element of the list from the next in the resulting string.
* If omitted, the list elements are separated with a comma.
*/
join(separator?: string): Series<string>;
last(): Series<T>;

@@ -7,0 +14,0 @@ /** Get the length of the arrays as UInt32. */

@@ -19,2 +19,3 @@ "use strict";

first: callExpr("first"),
join: callExpr("join"),
last: callExpr("last"),

@@ -21,0 +22,0 @@ lengths: callExpr("lengths"),

@@ -210,6 +210,8 @@ import { DataType, DtypeToPrimitive, Optional } from "../datatypes";

/**
* Extend the Series with given number of values.
* @param value The value to extend the Series with. This value may be null to fill with nulls.
* @param n The number of values to extend.
*/
* Extend the Series with given number of values.
* @param value The value to extend the Series with. This value may be null to fill with nulls.
* @param n The number of values to extend.
* @deprecated
* @see {@link extendConstant}
*/
extend(value: any, n: number): Series<T>;

@@ -221,2 +223,12 @@ extend(opt: {

/**
* Extend the Series with given number of values.
* @param value The value to extend the Series with. This value may be null to fill with nulls.
* @param n The number of values to extend.
*/
extendConstant(value: any, n: number): Series<T>;
extendConstant(opt: {
value: any;
n: number;
}): Series<T>;
/**
* __Fill null values with a filling strategy.__

@@ -223,0 +235,0 @@ * ___

@@ -241,6 +241,9 @@ "use strict";

extend(o, n) {
return this.extendConstant(o, n);
},
extendConstant(o, n) {
if (n !== null && typeof n === "number") {
return wrap("extend", { value: o, n });
return wrap("extend_constant", { value: o, n });
}
return wrap("extend", o);
return wrap("extend_constant", o);
},

@@ -247,0 +250,0 @@ fillNull(strategy) {
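On `Series`, `extend` now delegates to `extendConstant`; a short sketch of both accepted call forms (values illustrative):

```
import pl from "nodejs-polars";

const s = pl.Series("a", [1, 2, 3]);

// Append the value 99 three times; the object form is equivalent.
const longer = s.extendConstant(99, 3);
const viaOptions = s.extendConstant({ value: 99, n: 3 });
```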

@@ -153,2 +153,11 @@ import { DataType } from "../datatypes";

/**
* Split a string into substrings using the specified separator.
* The return type will be of type List&lt;Utf8&gt;
* @param separator — A string that identifies character or characters to use in separating the string.
* @param inclusive Include the split character/string in the results
*/
split(separator: string, options?: {
inclusive?: boolean;
} | boolean): Series<Series<string>>;
/**
* Parse a Series of dtype Utf8 to a Date/Datetime Series.

@@ -155,0 +164,0 @@ * @param datatype Date or Datetime.

@@ -80,2 +80,13 @@ "use strict";

},
split(by, options) {
const inclusive = typeof options === "boolean" ? options : options?.inclusive;
const s = (0, series_1.seriesWrapper)(_s);
return s
.toFrame()
.select((0, functions_1.col)(s.name)
.str
.split(by, inclusive)
.as(s.name))
.getColumn(s.name);
},
strftime(dtype, fmt) {

@@ -82,0 +93,0 @@ if (dtype === datatypes_1.DataType.Date) {
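And the direct `Series` form of `split`, which routes through the expression shown in the hunk above (data illustrative):

```
import pl from "nodejs-polars";

const s = pl.Series("csvish", ["a,b,c", "x,y"]);

// Produces a Series of string lists (Series<Series<string>>).
const parts = s.str.split(",");
```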

{
"name": "nodejs-polars",
"version": "0.2.1",
"version": "0.3.0",
"repository": "https://github.com/pola-rs/polars.git",

@@ -89,14 +89,14 @@ "license": "SEE LICENSE IN LICENSE",

"optionalDependencies": {
"nodejs-polars-win32-x64-msvc": "0.2.1",
"nodejs-polars-darwin-x64": "0.2.1",
"nodejs-polars-linux-x64-gnu": "0.2.1",
"nodejs-polars-win32-ia32-msvc": "0.2.1",
"nodejs-polars-linux-arm64-gnu": "0.2.1",
"nodejs-polars-linux-arm-gnueabihf": "0.2.1",
"nodejs-polars-darwin-arm64": "0.2.1",
"nodejs-polars-android-arm64": "0.2.1",
"nodejs-polars-linux-x64-musl": "0.2.1",
"nodejs-polars-linux-arm64-musl": "0.2.1",
"nodejs-polars-win32-arm64-msvc": "0.2.1"
"nodejs-polars-win32-x64-msvc": "0.3.0",
"nodejs-polars-darwin-x64": "0.3.0",
"nodejs-polars-linux-x64-gnu": "0.3.0",
"nodejs-polars-win32-ia32-msvc": "0.3.0",
"nodejs-polars-linux-arm64-gnu": "0.3.0",
"nodejs-polars-linux-arm-gnueabihf": "0.3.0",
"nodejs-polars-darwin-arm64": "0.3.0",
"nodejs-polars-android-arm64": "0.3.0",
"nodejs-polars-linux-x64-musl": "0.3.0",
"nodejs-polars-linux-arm64-musl": "0.3.0",
"nodejs-polars-win32-arm64-msvc": "0.3.0"
}
}