Source code for dask.datasets

import random

from .utils import import_required


def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    partition_freq="1d",
    dtypes=None,
    seed=None,
    **kwargs
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    dtypes : dict (optional)
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        When omitted, ``{"name": str, "id": int, "x": float, "y": float}``
        is used.
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed : int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Returns
    -------
    Whatever ``dask.dataframe.io.demo.make_timeseries`` returns for the
    given arguments.

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    >>> df = dask.datasets.timeseries(
    ...     '2000', '2010',
    ...     freq='2H', partition_freq='1D', seed=1,  # data frequency
    ...     dtypes={'value': float, 'name': str, 'id': int},  # data types
    ...     id_lam=1000  # control number of items in id column
    ... )
    """
    # Imported lazily so that importing this module does not pull in
    # dask.dataframe.
    from dask.dataframe.io.demo import make_timeseries

    # Build the default mapping per call: a dict literal in the signature
    # would be a single object shared (and mutable) across every invocation.
    if dtypes is None:
        dtypes = {"name": str, "id": int, "x": float, "y": float}

    return make_timeseries(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=dtypes,
        **kwargs
    )
def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """Generate data for a single partition of a dask bag

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.schema.Field``
    schema_description: Callable[Field] -> dict
        Called with the seeded ``Field`` to describe one record
    records_per_partition: int
        Number of records to yield
    seed: int
        Seed handed to ``Field``

    Yields
    ------
    The first element of ``Schema.create(iterations=1)``, once per record

    See Also
    --------
    _make_mimesis
    """
    from mimesis.schema import Schema, Field

    field = Field(seed=seed, **field)
    schema = Schema(schema=lambda: schema_description(field))
    for _ in range(records_per_partition):
        yield schema.create(iterations=1)[0]


def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
    records_per_partition: int
    seed: int, None
        Seed for random data

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

    # Draw one seed per partition from a generator seeded with ``seed``.
    random_state = random.Random(seed)
    seeds = [random_state.randint(0, 1 << 32) for _ in range(npartitions)]

    # Name the graph after the seeds that are actually used, not the ``seed``
    # argument: with ``seed=None`` the drawn seeds differ between calls, so
    # hashing ``seed`` would give differently-populated bags the same keys.
    # A fixed ``seed`` still yields the same ``seeds`` and hence a stable name.
    name = "mimesis-" + tokenize(
        field, schema, npartitions, records_per_partition, seeds
    )
    dsk = {
        (name, i): (
            _generate_mimesis,
            field,
            schema,
            records_per_partition,
            partition_seed,
        )
        for i, partition_seed in enumerate(seeds)
    }

    return db.Bag(dsk, name, npartitions)
def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Make a dataset of random people

    Builds a Dask Bag whose records are dictionaries describing randomly
    generated people. The optional library ``mimesis`` must be installed,
    since it is what generates the records.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    import_required(
        "mimesis",
        "The mimesis module is required for this function. Try:\n"
        " python -m pip install mimesis",
    )

    def schema(field):
        # Layout of a single person record; ``field`` resolves each
        # provider name to a generated value.
        return {
            "age": field("person.age"),
            "name": (field("person.name"), field("person.surname")),
            "occupation": field("person.occupation"),
            "telephone": field("person.telephone"),
            "address": {
                "address": field("address.address"),
                "city": field("address.city"),
            },
            "credit-card": {
                "number": field("payment.credit_card_number"),
                "expiration-date": field("payment.credit_card_expiration_date"),
            },
        }

    field_options = {"locale": locale}
    return _make_mimesis(
        field=field_options,
        schema=schema,
        npartitions=npartitions,
        records_per_partition=records_per_partition,
        seed=seed,
    )