Source code for dask.datasets

from __future__ import annotations

import random

from packaging.version import Version

from dask.utils import import_required



[docs]
def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    partition_freq="1D",
    dtypes=None,
    seed=None,
    **kwargs,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    dtypes : dict (optional)
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed : int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    >>> df = dask.datasets.timeseries(
    ...     '2000', '2010',
    ...     freq='2h', partition_freq='1D', seed=1,  # data frequency
    ...     dtypes={'value': float, 'name': str, 'id': int},  # data types
    ...     id_lam=1000  # control number of items in id column
    ... )
    """
    from dask.dataframe.dask_expr.datasets import timeseries

    if dtypes is None:
        dtypes = {"name": str, "id": int, "x": float, "y": float}

    return timeseries(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=dtypes,
        **kwargs,
    )



def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """Generate data for a single partition of a dask bag

    See Also
    --------
    _make_mimesis
    """
    import mimesis
    from mimesis.schema import Field, Schema

    field = Field(seed=seed, **field)
    # `iterations=` kwarg moved from `Schema.create()` to `Schema.__init__()`
    # starting with `mimesis=9`.
    schema_kwargs, create_kwargs = {}, {}
    if Version(mimesis.__version__) < Version("9.0.0"):
        create_kwargs["iterations"] = 1
    else:
        schema_kwargs["iterations"] = 1
    schema = Schema(schema=lambda: schema_description(field), **schema_kwargs)
    return [schema.create(**create_kwargs)[0] for i in range(records_per_partition)]


def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
    records_per_partition: int
    seed: int, None
        Seed for random data

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

    random_state = random.Random(seed)
    seeds = [random_state.randint(0, 1 << 32) for _ in range(npartitions)]

    name = "mimesis-" + tokenize(
        field, schema, npartitions, records_per_partition, seed
    )
    dsk = {
        (name, i): (_generate_mimesis, field, schema, records_per_partition, seed)
        for i, seed in enumerate(seeds)
    }

    return db.Bag(dsk, name, npartitions)



[docs]
def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Make a dataset of random people

    This makes a Dask Bag with dictionary records of randomly generated people.
    This requires the optional library ``mimesis`` to generate records.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    import_required(
        "mimesis",
        "The mimesis module is required for this function.  Try:\n"
        "  python -m pip install mimesis",
    )

    schema = lambda field: {
        "age": field("random.randint", a=0, b=120),
        "name": (field("person.name"), field("person.surname")),
        "occupation": field("person.occupation"),
        "telephone": field("person.telephone"),
        "address": {"address": field("address.address"), "city": field("address.city")},
        "credit-card": {
            "number": field("payment.credit_card_number"),
            "expiration-date": field("payment.credit_card_expiration_date"),
        },
    }

    return _make_mimesis(
        {"locale": locale}, schema, npartitions, records_per_partition, seed
    )