"""
 | 
						|
Table Schema builders
 | 
						|
 | 
						|
https://specs.frictionlessdata.io/json-table-schema/
 | 
						|
"""
 | 
						|
from __future__ import annotations
 | 
						|
 | 
						|
from typing import (
 | 
						|
    TYPE_CHECKING,
 | 
						|
    Any,
 | 
						|
    cast,
 | 
						|
)
 | 
						|
import warnings
 | 
						|
 | 
						|
import pandas._libs.json as json
 | 
						|
from pandas._typing import (
 | 
						|
    DtypeObj,
 | 
						|
    JSONSerializable,
 | 
						|
)
 | 
						|
 | 
						|
from pandas.core.dtypes.base import _registry as registry
 | 
						|
from pandas.core.dtypes.common import (
 | 
						|
    is_bool_dtype,
 | 
						|
    is_categorical_dtype,
 | 
						|
    is_datetime64_dtype,
 | 
						|
    is_datetime64tz_dtype,
 | 
						|
    is_extension_array_dtype,
 | 
						|
    is_integer_dtype,
 | 
						|
    is_numeric_dtype,
 | 
						|
    is_period_dtype,
 | 
						|
    is_string_dtype,
 | 
						|
    is_timedelta64_dtype,
 | 
						|
)
 | 
						|
from pandas.core.dtypes.dtypes import CategoricalDtype
 | 
						|
 | 
						|
from pandas import DataFrame
 | 
						|
import pandas.core.common as com
 | 
						|
 | 
						|
if TYPE_CHECKING:
 | 
						|
    from pandas import Series
 | 
						|
    from pandas.core.indexes.multi import MultiIndex
 | 
						|
 | 
						|
loads = json.loads
 | 
						|
 | 
						|
TABLE_SCHEMA_VERSION = "1.4.0"
 | 
						|
 | 
						|
 | 
						|
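# Example: a minimal sketch of how these builders surface in the public API,
# assuming pandas is imported as ``pd`` at the call site; ``orient="table"``
# routes ``DataFrame.to_json`` / ``pd.read_json`` through the functions below.
#
#     >>> df = pd.DataFrame({"a": [1, 2]})
#     >>> payload = df.to_json(orient="table")
#     >>> pd.read_json(payload, orient="table").equals(df)
#     True

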
def as_json_table_type(x: DtypeObj) -> str:
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype

    Returns
    -------
    str
        the Table Schema data type

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          string
    categorical     any
    ==============  =================
    """
    if is_integer_dtype(x):
        return "integer"
    elif is_bool_dtype(x):
        return "boolean"
    elif is_numeric_dtype(x):
        return "number"
    elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
        return "datetime"
    elif is_timedelta64_dtype(x):
        return "duration"
    elif is_categorical_dtype(x):
        return "any"
    elif is_extension_array_dtype(x):
        return "any"
    elif is_string_dtype(x):
        return "string"
    else:
        return "any"


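# Example: a minimal sketch of the dtype mapping above, assuming a standard
# NumPy / pandas install at the call site.
#
#     >>> import numpy as np
#     >>> as_json_table_type(np.dtype("int64"))
#     'integer'
#     >>> as_json_table_type(np.dtype("datetime64[ns]"))
#     'datetime'
#     >>> as_json_table_type(CategoricalDtype())
#     'any'

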
def set_default_names(data):
    """Sets index names to 'index' for a regular Index, or 'level_x' for a MultiIndex."""
    if com.all_not_none(*data.index.names):
        nms = data.index.names
        if len(nms) == 1 and data.index.name == "index":
            warnings.warn("Index name of 'index' is not round-trippable.")
        elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable."
            )
        return data

    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data


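# Example: a minimal sketch of the defaulting behavior; an unnamed index is
# labelled 'index' on a copy so it can be represented in the schema.
#
#     >>> df = DataFrame({"a": [1, 2]})
#     >>> set_default_names(df).index.name
#     'index'

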
def convert_pandas_type_to_json_field(arr):
    """Build the Table Schema field descriptor for a Series or Index."""
    dtype = arr.dtype
    if arr.name is None:
        name = "values"
    else:
        name = arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if is_categorical_dtype(dtype):
        cats = dtype.categories
        ordered = dtype.ordered

        field["constraints"] = {"enum": list(cats)}
        field["ordered"] = ordered
    elif is_period_dtype(dtype):
        field["freq"] = dtype.freq.freqstr
    elif is_datetime64tz_dtype(dtype):
        field["tz"] = dtype.tz.zone
    elif is_extension_array_dtype(dtype):
        field["extDtype"] = dtype.name
    return field


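# Example: a minimal sketch of the field descriptors built above, assuming
# pandas is imported as ``pd`` at the call site.
#
#     >>> convert_pandas_type_to_json_field(pd.Series([1, 2], name="a"))
#     {'name': 'a', 'type': 'integer'}
#     >>> convert_pandas_type_to_json_field(pd.Series(["x", "y"], dtype="category"))
#     {'name': 'values', 'type': 'any', 'constraints': {'enum': ['x', 'y']}, 'ordered': False}

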
def convert_json_field_to_pandas_type(field):
    """
    Converts a JSON field descriptor into its corresponding NumPy / pandas type

    Parameters
    ----------
    field
        A JSON field descriptor

    Returns
    -------
    dtype

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]
    if typ == "string":
        return "object"
    elif typ == "integer":
        return "int64"
    elif typ == "number":
        return "float64"
    elif typ == "boolean":
        return "bool"
    elif typ == "duration":
        return "timedelta64"
    elif typ == "datetime":
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        else:
            return "datetime64[ns]"
    elif typ == "any":
        if "constraints" in field and "ordered" in field:
            return CategoricalDtype(
                categories=field["constraints"]["enum"], ordered=field["ordered"]
            )
        elif "extDtype" in field:
            return registry.find(field["extDtype"])
        else:
            return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")


def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    schema : dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO 8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    if index is True:
        data = set_default_names(data)

    schema: dict[str, Any] = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            data.index = cast("MultiIndex", data.index)
            for level, name in zip(data.index.levels, data.index.names):
                new_field = convert_pandas_type_to_json_field(level)
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        for column, s in data.items():
            fields.append(convert_pandas_type_to_json_field(s))
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema


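# Example: a minimal sketch of the keyword behavior, assuming pandas is
# imported as ``pd`` at the call site; with ``index=False`` the index field
# and the 'primaryKey' entry are omitted.
#
#     >>> build_table_schema(pd.Series([1.0, 2.5], name="x"), index=False)
#     {'fields': [{'name': 'x', 'type': 'number'}], 'pandas_version': '1.4.0'}

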
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given Table Schema JSON string.

    Parameters
    ----------
    json : str
        A JSON document containing both ``schema`` and ``data``, as produced
        by ``DataFrame.to_json`` with ``orient="table"``
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains timedelta data

    Notes
    -----
        Because :func:`DataFrame.to_json` uses the string 'index' to denote a
        name-less :class:`Index`, this function sets the name of the returned
        :class:`DataFrame` to ``None`` when said string is encountered with a
        normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
        applies to any strings beginning with 'level_'. Therefore, an
        :class:`Index` name of 'index' and :class:`MultiIndex` names starting
        with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    col_order = [field["name"] for field in table["schema"]["fields"]]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in table["schema"]["fields"]
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" cannot yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            df.index.names = [
                None if x.startswith("level_") else x for x in df.index.names
            ]

    return df
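

# Example: a minimal round-trip sketch, assuming pandas is imported as ``pd``
# at the call site; ``DataFrame.to_json(orient="table")`` emits the document
# that ``parse_table_schema`` consumes.
#
#     >>> df = pd.DataFrame({"a": [1, 2]}, index=pd.Index(["x", "y"], name="id"))
#     >>> payload = df.to_json(orient="table")
#     >>> parse_table_schema(payload, precise_float=False).dtypes
#     a    int64
#     dtype: object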