from dask_expr import from_legacy_dataframe
from dask_expr._backends import dataframe_creation_dispatch
@dataframe_creation_dispatch.register_inplace("pandas")
def read_orc(
    path,
    engine="pyarrow",
    columns=None,
    index=None,
    split_stripes=1,
    aggregate_files=None,
    storage_options=None,
):
"""Read dataframe from ORC file(s)
Parameters
----------
path: str or list(str)
Location of file(s), which can be a full URL with protocol
specifier, and may include glob character if a single string.
engine: 'pyarrow' or ORCEngine
Backend ORC engine to use for I/O. Default is "pyarrow".
columns: None or list(str)
Columns to load. If None, loads all.
index: str
Column name to set as index.
split_stripes: int or False
Maximum number of ORC stripes to include in each output-DataFrame
partition. Use False to specify a 1-to-1 mapping between files
and partitions. Default is 1.
aggregate_files : bool, default False
Whether distinct file paths may be aggregated into the same output
partition. A setting of True means that any two file paths may be
aggregated into the same output partition, while False means that
inter-file aggregation is prohibited.
storage_options: None or dict
Further parameters to pass to the bytes backend.
Returns
-------
Dask.DataFrame (even if there is only one column)
Examples
--------
>>> df = dd.read_orc('https://github.com/apache/orc/raw/'
... 'master/examples/demo-11-zlib.orc') # doctest: +SKIP
"""
    from dask.dataframe.io import read_orc as _read_orc

    # Read with the legacy dask.dataframe implementation, then wrap the
    # result in an expression-backed DataFrame.
    df = _read_orc(
        path,
        engine=engine,
        columns=columns,
        index=index,
        split_stripes=split_stripes,
        aggregate_files=aggregate_files,
        storage_options=storage_options,
    )
    return from_legacy_dataframe(df)
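
# A minimal usage sketch of read_orc with the stripe and file-aggregation
# options described above. The glob path "data/*.orc" and the "timestamp"
# index column are hypothetical, not part of this module:
#
# >>> import dask.dataframe as dd
# >>> df = dd.read_orc(
# ...     "data/*.orc",
# ...     columns=["x", "y"],
# ...     index="timestamp",
# ...     split_stripes=2,        # up to two stripes per output partition
# ...     aggregate_files=True,   # allow small files to share a partition
# ... )  # doctest: +SKIP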

def to_orc(
    df,
    path,
    engine="pyarrow",
    write_index=True,
    storage_options=None,
    compute=True,
    compute_kwargs=None,
):
    """Write dataframe to ORC file(s)

    Parameters are forwarded unchanged to ``dask.dataframe.io.to_orc``;
    see that function for a full description.
    """
    from dask.dataframe.io import to_orc as _to_orc

    # Convert back to a legacy dask.dataframe collection and delegate the
    # actual write to the legacy implementation.
    return _to_orc(
        df.to_legacy_dataframe(),
        path,
        engine=engine,
        write_index=write_index,
        storage_options=storage_options,
        compute=compute,
        compute_kwargs=compute_kwargs,
    )
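
# A minimal round-trip sketch using the DataFrame method form of to_orc
# (dask collections expose ``df.to_orc``). The "out/" directory is
# hypothetical, not part of this module:
#
# >>> import dask.dataframe as dd
# >>> df.to_orc("out/", write_index=False)  # doctest: +SKIP
# >>> dd.read_orc("out/").head()            # doctest: +SKIP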