Skip to content

Code Reference

df_file_interchange.file.rw

The classes and functions in this module do the writing and reading.

Preamble

df_file_interchange.file.rw.chk_strict_frames_eq_ignore_nan(df1: pd.DataFrame, df2: pd.DataFrame)

Check whether two dataframes are equal, ignoring NaNs

This may be expensive since we have to make a copy of the dataframes to avoid mangling the originals. Raises exception if dfs are unequal.

Parameters:

Name Type Description Default
df1 DataFrame
required
df2 DataFrame
required

Returns:

Type Description
bool

Always True.

Source code in df_file_interchange/file/rw.py
def chk_strict_frames_eq_ignore_nan(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    """Check whether two dataframes are equal, ignoring NaNs

    NaN != NaN, so a strict element-wise comparison would flag otherwise
    identical frames as unequal. We sidestep this by replacing NaNs in float
    columns with a fixed finite sentinel before comparing. This may be
    expensive since we have to make a copy of the dataframes to avoid mangling
    the originals. Raises exception (AssertionError from assert_frame_equal)
    if dfs are unequal.

    Parameters
    ----------
    df1 : pd.DataFrame
    df2 : pd.DataFrame

    Returns
    -------
    bool
        Always True (an exception is raised on inequality).
    """

    # Any finite float works as the sentinel; both frames get the same one.
    const_float = np.pi

    # Copy the dataframes because we do not want to modify the originals
    loc_df1 = df1.copy()
    loc_df2 = df2.copy()

    # Only float columns can hold NaN here; we don't seem to need to bother
    # doing this for np.complex types, so this seems ok...
    float_dtypes = ("float16", "float32", "float64")
    col_list1 = [col for col in loc_df1 if loc_df1[col].dtype in float_dtypes]
    col_list2 = [col for col in loc_df2 if loc_df2[col].dtype in float_dtypes]

    # Replace NaN with the sentinel, per float column, in both frames.
    loc_df1.replace(
        to_replace={col: {np.nan: const_float} for col in col_list1}, inplace=True
    )
    loc_df2.replace(
        to_replace={col: {np.nan: const_float} for col in col_list2}, inplace=True
    )

    # Finally, we can do the test free from NaN != NaN issues.
    assert_frame_equal(
        loc_df1,
        loc_df2,
        check_dtype=True,
        check_index_type=True,
        check_column_type=True,
        check_categorical=True,
        check_frame_type=True,
        check_names=True,
        check_exact=True,
        check_freq=True,
    )

    return True

df_file_interchange.file.rw.FIFileFormatEnum

Bases: str, Enum

File formats used by file interchange

Source code in df_file_interchange/file/rw.py
class FIFileFormatEnum(str, Enum):
    """Enumeration of the file formats that file interchange can read and write."""

    # The values double as the canonical format/extension strings.
    csv = "csv"
    parquet = "parquet"

df_file_interchange.file.rw.FIIndexType

Bases: str, Enum

The type of an index, e.g. RangeIndex, Categorical, MultiIndex

Source code in df_file_interchange/file/rw.py
class FIIndexType(str, Enum):
    """Names the kind of a Pandas index, e.g. RangeIndex, Categorical, MultiIndex."""

    base = "base"
    # Using the literal "index" seems to cause a problem, hence "idx".
    idx = "idx"
    range = "range"
    categorical = "categorical"
    multi = "multi"
    interval = "interval"
    datetime = "datetime"
    timedelta = "timedelta"
    period = "period"

Encoding Specifications

Encoding options can be specified in a FIEncoding object, which in turn contains FIEncodingCSV and FIEncodingParquet as attributes (only the object corresponding to the file format applies when writing). These all construct themselves with default options, it's usually ill-advised to change these.

df_file_interchange.file.rw.FIEncodingCSV

Bases: BaseModel

The parameters we use for writing or reading CSV files.

NOTE! You almost certainly do not have any reason to change these defaults. They were tested to ensure that the roundtrip write-read is exactly correct.

Attributes:

Name Type Description
csv_allowed_na list[str]

Default ["<NA>"]. WE write all our files, so we can be more restrictive to reduce window for ambiguity when reading a file. In particular, it's a bad idea to confuse NaN with a null value with a missing value with an empty value -- these are NOT the same, despite what "data science" conventions might suggest. If you must be awkward, try ["-NaN", "-nan", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"] noting that "" is not in that list (that does cause problems).

sep str

Default ",". Explicitly define field separator

na_rep str

Default "<NA>". This must be in the csv_allowed_na list. What's used as the default na.

keep_default_na bool

Default False.

doublequote bool

Default True. How we're escaping quotes in a str.

quoting int

Default csv.QUOTE_NONNUMERIC. i.e. we only quote non-numeric values.

float_precision Literal['high', 'legacy', 'round_trip']

Default "round_trip". Weirdly, Pandas's other options, including the default, don't actually return what was written with floats.

Source code in df_file_interchange/file/rw.py
class FIEncodingCSV(BaseModel):
    """Parameters used for writing or reading CSV files.

    NOTE! You almost certainly do not have any reason to change these defaults.
    They were tested to ensure that the roundtrip write-read is exactly correct.

    Attributes
    ----------
    csv_allowed_na : list[str]
        Default ["<NA>"]. Since we write all our files ourselves, we can be
        restrictive to reduce the window for ambiguity when reading a file
        back: NaN, null, missing and empty values are NOT the same thing,
        despite what "data science" conventions might suggest. If you must be
        awkward, try ["-NaN", "-nan", "<NA>", "N/A", "NA", "NULL", "NaN",
        "None", "n/a", "nan", "null"], noting that "" is not in that list
        (that does cause problems).

    sep : str
        Default ",". Explicitly defined field separator.

    na_rep : str
        Default "<NA>". The string written out for NA values; must be in the
        csv_allowed_na list.

    keep_default_na : bool
        Default False.

    doublequote : bool
        Default True. How we're escaping quotes in a str.

    quoting : int
        Default csv.QUOTE_NONNUMERIC, i.e. we only quote non-numeric values.

    float_precision : Literal["high", "legacy", "round_trip"]
        Default "round_trip". Pandas's other options, including the default,
        don't reliably return what was written with floats.
    """

    csv_allowed_na: list[str] = ["<NA>"]
    sep: str = ","
    na_rep: str = "<NA>"
    keep_default_na: bool = False
    doublequote: bool = True
    quoting: int = csv.QUOTE_NONNUMERIC
    float_precision: Literal["high", "legacy", "round_trip"] = "round_trip"

    @model_validator(mode="after")
    def check_logic(self):
        # An empty na_rep is always permitted; otherwise it must be one of the
        # strings we will recognise as NA when reading the file back.
        if self.na_rep != "" and self.na_rep not in self.csv_allowed_na:
            error_msg = (
                f"na_rep must be in csv_allowed_na. na_rep={safe_str_output(self.na_rep)};"
                f" csv_allowed_na={safe_str_output(self.csv_allowed_na)}"
            )
            logger.error(error_msg)
            raise LookupError(error_msg)

        return self

df_file_interchange.file.rw.FIEncodingParquet

Bases: BaseModel

The parameters we used for writing parquet files

Again, there's really no need to change these.

Attributes:

Name Type Description
engine str

Default "pyarrow". Engine to use. Has to be consistent and was tested with pyarrow

index str | None

Default None. See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet

Source code in df_file_interchange/file/rw.py
class FIEncodingParquet(BaseModel):
    """The parameters we used for writing parquet files

    Again, there's really no need to change these.

    Attributes
    ----------
    engine : str
        Default "pyarrow". Engine to use. Has to be consistent and was tested
        with pyarrow

    index : str | None
        Default None. See
        https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
    """

    # Parquet engine; only pyarrow has been tested for exact roundtripping.
    engine: str = "pyarrow"
    # Passed through to DataFrame.to_parquet(); None lets Pandas decide how to
    # handle the index.
    index: str | None = None

df_file_interchange.file.rw.FIEncoding

Bases: BaseModel

General encoding options, includes CSV and Parquet encoding

Attributes:

Name Type Description
csv FIEncodingCSV

Default FIEncodingCSV(). Extra options that depend on format

parq FIEncodingParquet

Default FIEncodingParquet(). Extra options that depend on format

auto_convert_int_to_intna bool

Default True. Whether to automatically convert standard int dtypes to Pandas's Int64Dtype (which can also encode NA values), if there are one or more NAs or None(s) in the column

Source code in df_file_interchange/file/rw.py
class FIEncoding(BaseModel):
    """General encoding options, includes CSV and Parquet encoding

    Only the sub-object matching the target file format applies when writing.

    Attributes
    ----------
    csv : FIEncodingCSV
        Default FIEncodingCSV(). Extra options that depend on format

    parq : FIEncodingParquet
        Default FIEncodingParquet(). Extra options that depend on format

    auto_convert_int_to_intna : bool
        Default True. Whether to automatically convert standard int dtypes to
        Pandas's Int64Dtype (which can also encode NA values), if there are one
        or more NAs or None(s) in the column

    """

    # Format-specific sub-configurations, constructed with their defaults.
    csv: FIEncodingCSV = FIEncodingCSV()
    parq: FIEncodingParquet = FIEncodingParquet()
    # int -> Int64 (nullable) promotion when a column contains NA/None.
    auto_convert_int_to_intna: bool = True

Our Index Representation(s)

We have our own classes to represent Pandas indexes, which can perform operations such as serialization and instantiation (of the Pandas index). Everything here should derive from the FIBaseIndex base class.

df_file_interchange.file.rw.FIBaseIndex

Bases: BaseModel

Base class for our custom classes to be able to serialize/deserialize/instantiate Pandas indexes

This is derived from Pydantic BaseModel, so we can (and do) use those facilities.

Source code in df_file_interchange/file/rw.py
class FIBaseIndex(BaseModel):
    """Base class for our custom classes to be able to serialize/deserialize/instantiate Pandas indexes

    This is derived from Pydantic `BaseModel`, so we can (and do) use those
    facilities.
    """

    # TODO factory code to instantiate itself? (if possible from Pydantic model)

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.base.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        return FIIndexType.base

    def get_as_index(self, **kwargs) -> pd.Index:
        """Creates corresponding Pandas index

        Parameters
        ----------
        **kwargs : dict
            Not used at current time.

        Returns
        -------
        pd.Index
            The Pandas index created corresponding to our FIIndex type and data.
        """

        # pd.Index() with no data raises TypeError in Pandas ("must be called
        # with a collection"); pass an explicit empty collection to get the
        # intended empty index.
        return pd.Index([])

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.Index

Creates corresponding Pandas index

Params

**kwargs : dict Not used at current time.

Returns:

Type Description
Index

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.Index:
    """Creates corresponding Pandas index

    Parameters
    ----------
    **kwargs : dict
        Not used at current time.

    Returns
    -------
    pd.Index
        The Pandas index created corresponding to our FIIndex type and data.
    """

    # pd.Index() with no data raises TypeError in Pandas ("must be called with
    # a collection"); pass an explicit empty collection to get the intended
    # empty index.
    return pd.Index([])

get_fi_index_type() -> FIIndexType

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> FIIndexType:
    """Get the index type (one of the FIIndexType enum entries)"""

    # The base class always reports the generic "base" index type; subclasses
    # override this with their specific kind.
    return FIIndexType.base

df_file_interchange.file.rw.FIIndex

Bases: FIBaseIndex

Corresponds to pd.Index

See https://pandas.pydata.org/docs/reference/api/pandas.Index.html

Attributes:

Name Type Description
data ArrayLike | AnyArrayLike | list | tuple

The enumerated elements in the index.

name str | None = None

Optional name.

dtype Dtype | DtypeObj | ExtensionDtype | None

Dtype of the elements.

Source code in df_file_interchange/file/rw.py
class FIIndex(FIBaseIndex):
    """Corresponds to pd.Index

    See https://pandas.pydata.org/docs/reference/api/pandas.Index.html

    Attributes
    ----------

    data : ArrayLike | AnyArrayLike | list | tuple
        The enumerated elements in the index.

    name : str | None = None
        Optional name.

    dtype : Dtype | DtypeObj | pd.api.extensions.ExtensionDtype | None
        Dtype of the elements.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    data: ArrayLike | AnyArrayLike | list | tuple
    name: str | None = None
    dtype: Dtype | DtypeObj | pd.api.extensions.ExtensionDtype | None

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.idx.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        # N.B. return annotation fixed to FIIndexType for consistency with
        # FIBaseIndex.get_fi_index_type().
        return FIIndexType.idx

    def get_as_index(self, **kwargs) -> pd.Index:
        """Creates corresponding Pandas index

        Returns
        -------
        pd.Index
            The Pandas index created corresponding to our FIIndex type and data.
        """
        return pd.Index(
            data=self.data,
            name=self.name,
            dtype=self.dtype,
            copy=True,
        )

    @field_serializer("data", when_used="always")
    def serialize_data(self, data: ArrayLike | AnyArrayLike | list | tuple):
        # Element-level serialization keeps type information for round-trips.
        return _serialize_element(list(data))

    @field_serializer("dtype", when_used="always")
    def serialize_index_type(self, dtype: Dtype | None):
        return str(dtype)

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        # When deserializing, "data" arrives as a dict with "el"/"eltype" keys;
        # de novo construction passes through unchanged.
        if isinstance(data, dict):
            if (
                "data" in data.keys()
                and isinstance(data["data"], dict)
                and "el" in data["data"].keys()
                and "eltype" in data["data"].keys()
            ):
                data["data"] = _deserialize_element(data["data"])

        return data

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.Index

Creates corresponding Pandas index

Returns:

Type Description
Index

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.Index:
    """Build the pd.Index described by this object's data, name and dtype.

    Returns
    -------
    pd.Index
        The Pandas index corresponding to our FIIndex type and data.
    """

    # Assemble the constructor arguments explicitly; data is always copied so
    # the index does not alias our stored array.
    index_kwargs = {
        "data": self.data,
        "name": self.name,
        "dtype": self.dtype,
        "copy": True,
    }
    return pd.Index(**index_kwargs)

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> FIIndexType:
    """Get the index type (one of the FIIndexType enum entries)"""

    # Always FIIndexType.idx for this class (plain pd.Index). Return
    # annotation corrected from str to FIIndexType to match the base class.
    return FIIndexType.idx

df_file_interchange.file.rw.FIRangeIndex

Bases: FIBaseIndex

Corresponds to pd.RangeIndex

See https://pandas.pydata.org/docs/reference/api/pandas.RangeIndex.html

Attributes:

Name Type Description
start int

Where index starts counting from.

stop int

Where index stops counting.

step int

Step that index counts in.

name str | None

Optional name. Default None.

dtype DtypeObj | ExtensionDtype | str | None

Dtype of the index.

Source code in df_file_interchange/file/rw.py
class FIRangeIndex(FIBaseIndex):
    """Corresponds to pd.RangeIndex

    See https://pandas.pydata.org/docs/reference/api/pandas.RangeIndex.html

    Attributes
    ----------

    start : int
        Where index starts counting from.

    stop : int
        Where index stops counting.

    step : int
        Step that index counts in.

    name : str | None
        Optional name. Default None.

    dtype : DtypeObj | pd.api.extensions.ExtensionDtype | str | None
        Dtype of the index.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    start: int
    stop: int
    step: int
    name: str | None = None
    dtype: DtypeObj | pd.api.extensions.ExtensionDtype | str | None

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.range.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        # N.B. return annotation fixed to FIIndexType for consistency with
        # FIBaseIndex.get_fi_index_type().
        return FIIndexType.range

    def get_as_index(self, **kwargs) -> pd.RangeIndex:
        """Creates corresponding Pandas index

        Returns
        -------
        pd.RangeIndex
            The Pandas index created corresponding to our FIIndex type and data.
        """

        return pd.RangeIndex(
            start=self.start,
            stop=self.stop,
            step=self.step,
            name=self.name,
            dtype=self.dtype,
        )

    @field_serializer("dtype", when_used="always")
    def serialize_dtype(self, dtype: Dtype | None):
        return str(dtype)

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.RangeIndex

Creates corresponding Pandas index

Returns:

Type Description
RangeIndex

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.RangeIndex:
    """Build the pd.RangeIndex described by this object's start/stop/step.

    Returns
    -------
    pd.RangeIndex
        The Pandas index corresponding to our FIIndex type and data.
    """

    # Assemble constructor arguments explicitly, then build the index.
    range_kwargs = {
        "start": self.start,
        "stop": self.stop,
        "step": self.step,
        "name": self.name,
        "dtype": self.dtype,
    }
    return pd.RangeIndex(**range_kwargs)

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> FIIndexType:
    """Get the index type (one of the FIIndexType enum entries)"""

    # Always FIIndexType.range for this class (pd.RangeIndex). Return
    # annotation corrected from str to FIIndexType to match the base class.
    return FIIndexType.range

df_file_interchange.file.rw.FICategoricalIndex

Bases: FIBaseIndex

Corresponds to pd.CategoricalIndex

See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.CategoricalIndex.html

Attributes:

Name Type Description
data ArrayLike | AnyArrayLike | list | tuple

Elements in index.

categories ArrayLike | AnyArrayLike | list | tuple

List from which elements in data must belong.

ordered bool

Whether data should be ordered?

name str | None

Optional name. Default None.

dtype DtypeObj | ExtensionDtype | CategoricalDtype | str | None

Dtype of elements.

Source code in df_file_interchange/file/rw.py
class FICategoricalIndex(FIBaseIndex):
    """Corresponds to pd.CategoricalIndex

    See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.CategoricalIndex.html

    Attributes
    ----------

    data : ArrayLike | AnyArrayLike | list | tuple
        Elements in index.

    categories : ArrayLike | AnyArrayLike | list | tuple
        List from which elements in data must belong.

    ordered : bool
        Whether the categorical is treated as ordered.

    name : str | None
        Optional name. Default None.

    dtype : DtypeObj | pd.api.extensions.ExtensionDtype | pd.CategoricalDtype | str | None
        Dtype of elements.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    data: ArrayLike | AnyArrayLike | list | tuple
    categories: ArrayLike | AnyArrayLike | list | tuple
    ordered: bool
    name: str | None = None
    dtype: (
        DtypeObj | pd.api.extensions.ExtensionDtype | pd.CategoricalDtype | str | None
    )

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.categorical.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        # N.B. return annotation fixed to FIIndexType for consistency with
        # FIBaseIndex.get_fi_index_type().
        return FIIndexType.categorical

    def get_as_index(self, **kwargs) -> pd.CategoricalIndex:
        """Creates corresponding Pandas index

        Returns
        -------
        pd.CategoricalIndex
            The Pandas index created corresponding to our FIIndex type and data.
        """

        return pd.CategoricalIndex(
            data=self.data,
            categories=self.categories,
            ordered=self.ordered,
            name=self.name,
            dtype=self.dtype,
            copy=True,
        )

    @field_serializer("data", when_used="always")
    def serialize_data(self, data: ArrayLike | AnyArrayLike | list | tuple):
        # Element-level serialization keeps type information for round-trips.
        return _serialize_element(list(data))

    @field_serializer("categories", when_used="always")
    def serialize_categories(self, categories: ArrayLike | AnyArrayLike | list | tuple):
        return _serialize_element(list(categories))

    @field_serializer("dtype", when_used="always")
    def serialize_dtype(self, dtype: Dtype | None):
        return str(dtype)

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        # When deserializing, "data" and "categories" arrive as dicts with
        # "el"/"eltype" keys; de novo construction passes through unchanged.
        if isinstance(data, dict):
            if (
                "data" in data.keys()
                and isinstance(data["data"], dict)
                and "el" in data["data"].keys()
                and "eltype" in data["data"].keys()
            ):
                data["data"] = _deserialize_element(data["data"])

            if (
                "categories" in data.keys()
                and isinstance(data["categories"], dict)
                and "el" in data["categories"].keys()
                and "eltype" in data["categories"].keys()
            ):
                data["categories"] = _deserialize_element(data["categories"])

        return data

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.CategoricalIndex

Creates corresponding Pandas index

Returns:

Type Description
CategoricalIndex

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.CategoricalIndex:
    """Build the pd.CategoricalIndex described by this object's fields.

    Returns
    -------
    pd.CategoricalIndex
        The Pandas index corresponding to our FIIndex type and data.
    """

    # Assemble constructor arguments explicitly; data is always copied so the
    # index does not alias our stored array.
    cat_kwargs = {
        "data": self.data,
        "categories": self.categories,
        "ordered": self.ordered,
        "name": self.name,
        "dtype": self.dtype,
        "copy": True,
    }
    return pd.CategoricalIndex(**cat_kwargs)

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> FIIndexType:
    """Get the index type (one of the FIIndexType enum entries)"""

    # Always FIIndexType.categorical for this class (pd.CategoricalIndex).
    # Return annotation corrected from str to FIIndexType to match the base class.
    return FIIndexType.categorical

df_file_interchange.file.rw.FIMultiIndex

Bases: FIBaseIndex

Corresponds to pd.MultiIndex

See https://pandas.pydata.org/docs/reference/api/pandas.MultiIndex.html and https://pandas.pydata.org/docs/user_guide/advanced.html

Attributes:

Name Type Description
levels list

The number of levels in the multiindex.

codes list

The list of lists (I think), of the elements in the index.

sortorder int | None

Default None.

names list

List of names for the levels.

dtypes Series | list

Dtype specifications.

Source code in df_file_interchange/file/rw.py
class FIMultiIndex(FIBaseIndex):
    """Corresponds to pd.MultiIndex

    See https://pandas.pydata.org/docs/reference/api/pandas.MultiIndex.html and
    https://pandas.pydata.org/docs/user_guide/advanced.html

    Attributes
    ----------

    levels : list
        One list of level values per level of the multiindex.

    codes : list
        One list of integer codes per level; each code indexes into the
        corresponding entry of `levels`.

    sortorder : int | None
        Default None.

    names : list
        List of names for the levels.

    dtypes : pd.Series | list
        Dtype specifications.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    levels: list
    codes: list
    sortorder: int | None = None
    names: list
    dtypes: pd.Series | list  # Hmmm.

    # Need some extra validation logic to ensure FrozenList(s) contain what is expected

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.multi.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        return FIIndexType.multi

    def get_as_index(self, **kwargs) -> pd.MultiIndex:
        """Creates corresponding Pandas index

        Returns
        -------
        pd.MultiIndex
            The Pandas index created corresponding to our FIIndex type and data.
        """

        return pd.MultiIndex(
            levels=self.levels,
            codes=self.codes,
            sortorder=self.sortorder,
            names=self.names,
            dtype=self.dtypes,  # Not used in Pandas source
            copy=True,
            verify_integrity=True,
        )

    @field_serializer("levels", when_used="always")
    def serialize_levels(self, levels: list):
        # Serialize each level's value array individually so element types
        # survive the round trip.
        loc_levels = []
        for level in levels:
            loc_levels.append(_serialize_element(level))

        return loc_levels

    @field_serializer("codes", when_used="always")
    def serialize_codes(self, codes: list):
        # Same per-entry treatment as levels.
        loc_codes = []
        for code in codes:
            loc_codes.append(_serialize_element(code))

        return loc_codes

    @field_serializer("names", when_used="always")
    def serialize_names(self, names: list):
        # names may arrive as a numpy array; normalise to a plain list.
        if isinstance(names, np.ndarray):
            return names.tolist()
        else:
            return list(names)

    @field_serializer("dtypes", when_used="always")
    def serialize_dtypes(self, dtypes: pd.Series | list):
        # Ouch.
        return list(map(str, list(dtypes)))

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        # When deserializing, each entry of "levels"/"codes" arrives as a dict
        # with "el"/"eltype" keys; de novo construction passes entries through.
        if isinstance(data, dict):
            # Check if data provided is a "true" data array or if it's serialized from before
            if (
                "levels" in data.keys()
                and len(data["levels"]) > 0
                and isinstance(data["levels"], list)
            ):
                loc_levels = []
                for cur_level in data["levels"]:
                    # Need to test whether we're deserializing or de novo construction
                    if (
                        isinstance(cur_level, dict)
                        and "el" in cur_level.keys()
                        and "eltype" in cur_level.keys()
                    ):
                        loc_levels.append(_deserialize_element(cur_level))
                    else:
                        loc_levels.append(cur_level)

                data["levels"] = loc_levels

            if (
                "codes" in data.keys()
                and len(data["codes"]) > 0
                and isinstance(data["codes"], list)
            ):
                loc_codes = []
                for cur_code in data["codes"]:
                    # Need to test whether we're deserializing or de novo construction
                    if (
                        isinstance(cur_code, dict)
                        and "el" in cur_code.keys()
                        and "eltype" in cur_code.keys()
                    ):
                        loc_codes.append(_deserialize_element(cur_code))
                    else:
                        loc_codes.append(cur_code)

                data["codes"] = loc_codes

        return data

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.MultiIndex

Creates corresponding Pandas index

Returns:

Type Description
MultiIndex

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.MultiIndex:
    """Creates corresponding Pandas index

    Returns
    -------
    pd.MultiIndex
        The Pandas index created corresponding to our FIIndex type and data.
    """

    # NOTE(review): per the original author's note below, the `dtype` argument
    # appears to be ignored by pd.MultiIndex; the level dtypes are determined
    # by the levels themselves — confirm against the pinned pandas version.
    return pd.MultiIndex(
        levels=self.levels,
        codes=self.codes,
        sortorder=self.sortorder,
        names=self.names,
        dtype=self.dtypes,  # Not used in Pandas source
        copy=True,
        verify_integrity=True,
    )

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> FIIndexType:
    """Get the index type (one of the FIIndexType enum entries)"""

    # Always FIIndexType.multi for this class (pd.MultiIndex). Return
    # annotation corrected from str to FIIndexType to match the base class.
    return FIIndexType.multi

df_file_interchange.file.rw.FIIntervalIndex

Bases: FIBaseIndex

Corresponds to pd.IntervalIndex

See https://pandas.pydata.org/docs/reference/api/pandas.IntervalIndex.html

Attributes:

Name Type Description
data IntervalArray | ndarray

The data array (of intervals!).

closed IntervalClosedType

How each interval is closed or not: "left", "right", "closed", "neither".

name str or None

Optional name. Default None.

dtype IntervalDtype | str | None
Source code in df_file_interchange/file/rw.py
class FIIntervalIndex(FIBaseIndex):
    """Corresponds to pd.IntervalIndex

    See https://pandas.pydata.org/docs/reference/api/pandas.IntervalIndex.html

    Attributes
    ----------

    data : pd.arrays.IntervalArray | np.ndarray
        The data array (of intervals!).

    closed : IntervalClosedType
        How each interval is closed or not: "left", "right", "both", "neither".

    name : str or None
        Optional name. Default None.

    dtype : pd.IntervalDtype | str | None
        Dtype of the intervals.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    data: pd.arrays.IntervalArray | np.ndarray
    closed: IntervalClosedType
    name: str | None = None
    dtype: pd.IntervalDtype | str | None

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.interval.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        return FIIndexType.interval

    def get_as_index(self, **kwargs) -> pd.IntervalIndex:
        """Creates corresponding Pandas index

        Returns
        -------
        pd.IntervalIndex
            The Pandas index created corresponding to our FIIndex type and data.
        """

        return pd.IntervalIndex(
            data=self.data,  # type: ignore
            closed=self.closed,
            name=self.name,
            dtype=self.dtype,  # type: ignore
            copy=True,
        )

    @field_serializer("data", when_used="always")
    def serialize_data(self, data: pd.arrays.IntervalArray | np.ndarray):
        # Element-level serialization keeps type information for round-trips.
        return _serialize_element(list(data))

    @field_serializer("dtype", when_used="always")
    def serialize_dtype(self, dtype: Dtype | None):
        return str(dtype)

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        # When deserializing, "data" arrives as a dict with "el"/"eltype" keys;
        # the raw elements are then coerced back into an IntervalArray.
        if isinstance(data, dict):
            if (
                "data" in data.keys()
                and isinstance(data["data"], dict)
                and "el" in data["data"].keys()
                and "eltype" in data["data"].keys()
            ):
                data["data"] = _deserialize_element(data["data"])

                # Force IntervalArray
                data["data"] = pd.arrays.IntervalArray(data["data"])

        return data

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.IntervalIndex

Creates corresponding Pandas index

Returns:

Type Description
IntervalIndex

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.IntervalIndex:
    """Creates corresponding Pandas index

    N.B. `**kwargs` is accepted for interface uniformity but not used here.

    Returns
    -------
    pd.IntervalIndex
        The Pandas index created corresponding to our FIIndex type and data.
    """

    # copy=True so the returned index does not share storage with self.data
    return pd.IntervalIndex(
        data=self.data,  # type: ignore
        closed=self.closed,
        name=self.name,
        dtype=self.dtype,  # type: ignore
        copy=True,
    )

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> str:
    """Get the index type (one of the FIIndex enum entires)"""

    return FIIndexType.interval

df_file_interchange.file.rw.FIDatetimeIndex

Bases: FIBaseIndex

Corresponds to pd.DatetimeIndex

See https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.html

Attributes:

Name Type Description
data ArrayLike | AnyArrayLike | list | tuple

Array of datetimes.

freq _Frequency | None = None

Optional frequency. See Pandas docs for what this means.

tz tzinfo | str | None

Optional tz.

name str | None = None

Optional name.

dtype Dtype | str | None
Source code in df_file_interchange/file/rw.py
class FIDatetimeIndex(FIBaseIndex):
    """Corresponds to pd.DatetimeIndex

    See https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.html

    Attributes
    ----------

    data: ArrayLike | AnyArrayLike | list | tuple
        Array of datetimes.

    freq: _Frequency | None = None
        Optional frequency. See Pandas docs for what this means.

    tz: tzinfo | str | None
        Optional tz.

    name: str | None = None
        Optional name.

    dtype: Dtype | str | None
        Optional dtype.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    data: ArrayLike | AnyArrayLike | list | tuple
    freq: _Frequency | None = None
    tz: tzinfo | str | None  # roughly what it should be, going by the pandas src
    name: str | None = None
    dtype: Dtype | str | None  # Hmmm.

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.datetime.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        # N.B. returns the enum member itself, not its .name string
        return FIIndexType.datetime

    def get_as_index(self, **kwargs) -> pd.DatetimeIndex:
        """Creates corresponding Pandas index

        N.B. `**kwargs` is accepted for interface uniformity but not used here.

        Returns
        -------
        pd.DatetimeIndex
            The Pandas index created corresponding to our FIIndex type and data.
        """

        # copy=True so the returned index does not share storage with self.data
        return pd.DatetimeIndex(
            data=self.data,
            freq=self.freq,
            tz=self.tz,
            name=self.name,
            dtype=self.dtype,
            copy=True,
        )

    @field_serializer("data", when_used="always")
    def serialize_data(self, data: ArrayLike | AnyArrayLike | list | tuple):
        return _serialize_element(data)

    @field_serializer("freq", when_used="always")
    def serialize_freq(self, freq):
        # NOTE(review): assumes a non-None freq is an offset-like object
        # exposing .freqstr; a plain str freq would raise here -- confirm that
        # validation always coerces freq before serialization.
        if self.freq is None:
            return None
        else:
            return freq.freqstr

    @field_serializer("tz", when_used="always")
    def serialize_tz(self, tz):
        # tz serialized via its string form; None passes through unchanged.
        if self.tz is None:
            return None
        else:
            return str(self.tz)

    @field_serializer("dtype", when_used="always")
    def serialize_dtype(self, dtype: Dtype | None):
        # Serialize the dtype as its string representation.
        return str(dtype)

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        """Pre-validator: rehydrate `data` when supplied in serialized form."""
        if isinstance(data, dict):
            # Check if data provided is a "true" data array or if it's serialized from before
            # (a dict with both "el" and "eltype" keys is the serialized format)
            if (
                "data" in data.keys()
                and isinstance(data["data"], dict)
                and "el" in data["data"].keys()
                and "eltype" in data["data"].keys()
            ):
                data["data"] = _deserialize_element(data["data"])

        return data

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.DatetimeIndex

Creates corresponding Pandas index

Returns:

Type Description
DatetimeIndex

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.DatetimeIndex:
    """Creates corresponding Pandas index

    N.B. `**kwargs` is accepted for interface uniformity but not used here.

    Returns
    -------
    pd.DatetimeIndex
        The Pandas index created corresponding to our FIIndex type and data.
    """

    # copy=True so the returned index does not share storage with self.data
    return pd.DatetimeIndex(
        data=self.data,
        freq=self.freq,
        tz=self.tz,
        name=self.name,
        dtype=self.dtype,
        copy=True,
    )

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> str:
    """Get the index type (one of the FIIndex enum entires)"""

    return FIIndexType.datetime

df_file_interchange.file.rw.FITimedeltaIndex

Bases: FIBaseIndex

Corresponds to pd.TimedeltaIndex

See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.TimedeltaIndex.html

Attributes:

Name Type Description
data ArrayLike | AnyArrayLike | list | tuple

Array of timedeltas.

freq str | BaseOffset | None = None

Optional frequency. See Pandas docs for details.

name str | None = None

Optional name.

dtype DtypeObj | TimeDelta64DType | Literal['<m8[ns]'] | str | None
Source code in df_file_interchange/file/rw.py
class FITimedeltaIndex(FIBaseIndex):
    """Corresponds to pd.TimedeltaIndex

    See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.TimedeltaIndex.html

    Attributes
    ----------

    data : ArrayLike | AnyArrayLike | list | tuple
        Array of timedeltas.

    freq : str | BaseOffset | None = None
        Optional frequency. See Pandas docs for details.

    name : str | None = None
        Optional name.

    dtype : DtypeObj | np.dtypes.TimeDelta64DType | Literal["<m8[ns]"] | str | None
        Optional dtype.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    data: ArrayLike | AnyArrayLike | list | tuple
    freq: str | BaseOffset | None = None
    name: str | None = None
    dtype: (
        DtypeObj | np.dtypes.TimeDelta64DType | Literal["<m8[ns]"] | str | None
    )  # Hmmm.

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.timedelta.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        # N.B. returns the enum member itself, not its .name string
        return FIIndexType.timedelta

    def get_as_index(self, **kwargs) -> pd.TimedeltaIndex:
        """Creates corresponding Pandas index

        N.B. `**kwargs` is accepted for interface uniformity but not used here.

        Returns
        -------
        pd.TimedeltaIndex
            The Pandas index created corresponding to our FIIndex type and data.
        """

        # copy=True so the returned index does not share storage with self.data
        return pd.TimedeltaIndex(
            data=self.data,  # type: ignore
            freq=self.freq,  # type: ignore
            name=self.name,  # type: ignore
            dtype=self.dtype,  # type: ignore
            copy=True,
        )

    @field_serializer("data", when_used="always")
    def serialize_data(self, data: ArrayLike | AnyArrayLike | list | tuple):
        # NOTE(review): wraps data in list() here, unlike
        # FIDatetimeIndex.serialize_data which passes data straight through --
        # presumably required for the timedelta array; confirm intended.
        return _serialize_element(list(data))

    @field_serializer("freq", when_used="always")
    def serialize_freq(self, freq):
        # NOTE(review): freq is annotated str | BaseOffset | None, but a plain
        # str has no .freqstr and would raise here -- confirm freq is always
        # coerced to an offset object before serialization.
        if self.freq is None:
            return None
        else:
            return freq.freqstr

    @field_serializer("dtype", when_used="always")
    def serialize_dtype(self, dtype: Dtype | None):
        # Serialize the dtype as its string representation.
        return str(dtype)

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        """Pre-validator: rehydrate `data` when supplied in serialized form."""
        if isinstance(data, dict):
            # A dict with both "el" and "eltype" keys is the serialized format.
            if (
                "data" in data.keys()
                and isinstance(data["data"], dict)
                and "el" in data["data"].keys()
                and "eltype" in data["data"].keys()
            ):
                data["data"] = _deserialize_element(data["data"])

        return data

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.TimedeltaIndex

Creates corresponding Pandas index

Returns:

Type Description
TimedeltaIndex

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.TimedeltaIndex:
    """Creates corresponding Pandas index

    N.B. `**kwargs` is accepted for interface uniformity but not used here.

    Returns
    -------
    pd.TimedeltaIndex
        The Pandas index created corresponding to our FIIndex type and data.
    """

    # copy=True so the returned index does not share storage with self.data
    return pd.TimedeltaIndex(
        data=self.data,  # type: ignore
        freq=self.freq,  # type: ignore
        name=self.name,  # type: ignore
        dtype=self.dtype,  # type: ignore
        copy=True,
    )

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> str:
    """Get the index type (one of the FIIndex enum entires)"""

    return FIIndexType.timedelta

df_file_interchange.file.rw.FIPeriodIndex

Bases: FIBaseIndex

Corresponds to pd.PeriodIndex

See https://pandas.pydata.org/docs/reference/api/pandas.PeriodIndex.html

data: ArrayLike | AnyArrayLike | list | tuple Array of periods.

freq: _Frequency | None = None Optional frequency. See Pandas docs.

name: str | None = None Optional name

dtype: DtypeObj | pd.PeriodDtype | str | None # Hmmm.

Source code in df_file_interchange/file/rw.py
class FIPeriodIndex(FIBaseIndex):
    """Corresponds to pd.PeriodIndex

    See https://pandas.pydata.org/docs/reference/api/pandas.PeriodIndex.html

    Attributes
    ----------

    data: ArrayLike | AnyArrayLike | list | tuple
        Array of periods.

    freq: _Frequency | None = None
        Optional frequency. See Pandas docs.

    name: str | None = None
        Optional name

    dtype: DtypeObj | pd.PeriodDtype | str | None
        Optional dtype.

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    data: ArrayLike | AnyArrayLike | list | tuple
    freq: _Frequency | None = None
    name: str | None = None
    dtype: DtypeObj | pd.PeriodDtype | str | None  # Hmmm.

    @computed_field(title="index_type")
    @property
    def index_type(self) -> str:
        """Get the str name for the index (one of the FIIndexType enum entries)"""

        return FIIndexType.period.name

    def get_fi_index_type(self) -> FIIndexType:
        """Get the index type (one of the FIIndexType enum entries)"""

        # N.B. returns the enum member itself, not its .name string
        return FIIndexType.period

    def get_as_index(self, **kwargs) -> pd.PeriodIndex:
        """Creates corresponding Pandas index

        N.B. `**kwargs` is accepted for interface uniformity but not used here.

        Returns
        -------
        pd.PeriodIndex
            The Pandas index created corresponding to our FIIndex type and data.
        """

        # copy=True so the returned index does not share storage with self.data
        return pd.PeriodIndex(
            data=self.data,
            # freq=self.freq,   -- disabled because it seems to mess things up, info included in dtype and freq is old notation (QE-DEC instead of Q-DEC)
            name=self.name,
            dtype=self.dtype,
            copy=True,
        )

    @field_serializer("data", when_used="always")
    def serialize_data(self, data: ArrayLike | AnyArrayLike | list | tuple):
        return _serialize_element(data)

    @field_serializer("freq", when_used="always")
    def serialize_freq(self, freq):
        # NOTE(review): assumes a non-None freq exposes .freqstr; a plain str
        # freq would raise here -- confirm validation coerces it first.
        if self.freq is None:
            return None
        else:
            return freq.freqstr

    @field_serializer("dtype", when_used="always")
    def serialize_dtype(self, dtype: Dtype | None):
        # Serialize the dtype as its string representation.
        return str(dtype)

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        """Pre-validator: rehydrate `data` when supplied in serialized form."""
        if isinstance(data, dict):
            # A dict with both "el" and "eltype" keys is the serialized format.
            if (
                "data" in data.keys()
                and isinstance(data["data"], dict)
                and "el" in data["data"].keys()
                and "eltype" in data["data"].keys()
            ):
                data["data"] = _deserialize_element(data["data"])

        return data

index_type: str property

Get the str name for the index (one of the FIIndexType enum entries)

get_as_index(**kwargs) -> pd.PeriodIndex

Creates corresponding Pandas index

Returns:

Type Description
PeriodIndex

The Pandas index created corresponding to our FIIndex type and data.

Source code in df_file_interchange/file/rw.py
def get_as_index(self, **kwargs) -> pd.PeriodIndex:
    """Creates corresponding Pandas index

    N.B. `**kwargs` is accepted for interface uniformity but not used here.

    Returns
    -------
    pd.PeriodIndex
        The Pandas index created corresponding to our FIIndex type and data.
    """

    # copy=True so the returned index does not share storage with self.data
    return pd.PeriodIndex(
        data=self.data,
        # freq=self.freq,   -- disabled because it seems to mess things up, info included in dtype and freq is old notation (QE-DEC instead of Q-DEC)
        name=self.name,
        dtype=self.dtype,
        copy=True,
    )

get_fi_index_type() -> str

Get the index type (one of the FIIndexType enum entries)

Source code in df_file_interchange/file/rw.py
def get_fi_index_type(self) -> str:
    """Get the index type (one of the FIIndex enum entires)"""

    return FIIndexType.period

df_file_interchange.file.rw.FIMetainfo

Bases: BaseModel

All the collected metadata we use when saving or loading

N.B. The order of the attributes is important in the sense that the serialization automatically preserves the order, and then yaml.dump() does too. This means we can make the YAML file a little easier to read/parse by a human.

Attributes:

Name Type Description
datafile Path

Ironically, this should always just be the filename with no paths

file_format FIFileFormatEnum

The file format of datafile.

format_version int

Default 1. Not really used yet but we might need to version the YAML file.

hash str | None

SHA256 hash of the datafile.

encoding FIEncoding

How the datafile was or is to be encoded.

custom_info SerializeAsAny[FIBaseCustomInfo]

Structured custom info. Can just be an empty FIBaseCustomInfo object.

serialized_dtypes dict

Dtypes of the dataframe.

index FIBaseIndex

Index information encoded as a FIBaseIndex object (descendent thereof).

columns FIBaseIndex

Columns, again, specified as an FIIndex object

Source code in df_file_interchange/file/rw.py
class FIMetainfo(BaseModel):
    """All the collected metadata we use when saving or loading

    N.B. The _order_ of the attributes is important in the sense that the
    serialization automatically preserves the order, and then `yaml.dump()` does
    too. This means we can make the YAML file a little easier to read/parse by a
    human.

    Attributes
    ----------

    datafile : Path
        Ironically, this should always just be the filename with no paths

    file_format : FIFileFormatEnum
        The file format of datafile.

    format_version: int
        Default 1. Not really used yet but we might need to version the YAML file.

    hash: str | None
        SHA256 hash of the datafile.

    encoding: FIEncoding
        How the datafile was or is to be encoded.

    custom_info: SerializeAsAny[FIBaseCustomInfo]
        Structured custom info. Can just be an empty FIBaseCustomInfo object.

    serialized_dtypes: dict
        Dtypes of the dataframe.

    index: FIBaseIndex
        Index information encoded as a FIBaseIndex object (descendent thereof).

    columns: FIBaseIndex
        Columns, again, specified as an FIIndex object

    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Ironically, this should always just be the filename with no paths
    datafile: Path

    # File format
    file_format: FIFileFormatEnum

    # Format version
    format_version: int = 1

    # SHA256 hash
    hash: str | None = None

    # Encoding
    encoding: FIEncoding

    # Custom info (user defined metainfo)
    custom_info: SerializeAsAny[FIBaseCustomInfo]

    # Serialized dtypes
    serialized_dtypes: dict

    # Index information encoded as a FIIndex object
    index: FIBaseIndex

    # Columns, again, as an FIIndex object
    columns: FIBaseIndex

    @field_serializer("datafile", when_used="always")
    def serialize_datafile(self, datafile: Path):
        # Store the path as a plain string so it YAML-serializes cleanly.
        return str(datafile)

    @field_serializer("file_format", when_used="always")
    def serialize_file_format(self, file_format: FIFileFormatEnum):
        # Store the enum by name rather than value.
        return file_format.name

    @field_serializer("index", when_used="always")
    def serialize_index(self, index: FIBaseIndex):
        # TODO is this ok if caller does a model_dump_json()?
        return index.model_dump()

    @field_serializer("columns", when_used="always")
    def serialize_columns(self, columns: FIBaseIndex):
        # TODO is this ok if caller does a model_dump_json()?
        return columns.model_dump()

    @field_validator("custom_info", mode="before")
    @classmethod
    def validator_custom_info(
        cls, value: dict | FIBaseCustomInfo, info: ValidationInfo
    ) -> FIBaseCustomInfo:
        """Instantiate the correct FIBaseCustomInfo subclass from a dict.

        The target class is looked up first in the validation context
        (``clss_custom_info``) and, failing that, among module globals.
        """
        # Shortcut exit, if we've been passed something with extra_info already
        # instantiated. We only deal with dicts here.
        if not isinstance(value, dict):
            return value

        # By default we don't use a context
        clss_custom_info = None

        # If we don't have context, just use the base class or return as-is
        if info.context and isinstance(info.context, dict):
            # Get the available classes for extra_info (this should also be a
            # dictionary)
            clss_custom_info = info.context.get(
                "clss_custom_info", {"FIBaseCustomInfo": FIBaseCustomInfo}
            )
            assert isinstance(clss_custom_info, dict)

        # Now process. NOTE(review): a dict without a "classname" key always
        # ends in the TypeError branch below (None is neither in the context
        # dict nor in globals) -- confirm that is intended rather than
        # defaulting to FIBaseCustomInfo.
        value_classname = value.get("classname", None)
        if (
            value_classname
            and clss_custom_info is not None
            and value_classname in clss_custom_info.keys()
        ):
            # Now instantiate the model
            custom_info_class = clss_custom_info[value_classname]
        elif value_classname in globals().keys() and issubclass(
            globals()[value_classname], FIBaseCustomInfo
        ):
            # Fallback: resolve the class by name from module globals, but only
            # accept genuine FIBaseCustomInfo subclasses.
            custom_info_class = globals()[value_classname]
        else:
            error_msg = f"Neither context for supplied classname nor is it a subclass of FIBaseCustomInfo. classname={safe_str_output(value_classname)}"
            logger.error(error_msg)
            raise TypeError(error_msg)

        assert issubclass(custom_info_class, FIBaseCustomInfo)
        return custom_info_class.model_validate(value, context=info.context)

    @model_validator(mode="before")
    @classmethod
    def pre_process(cls, data: Any) -> Any:
        """Pre-validator: deserialize index/columns dicts into FIIndex objects."""
        # TODO perhaps move index and columns into separate field validators.

        if isinstance(data, dict):
            # Need to ensure the index and columns and custominfo are created as
            # the correct object type, not just instantiating the base class.
            if "index" in data.keys() and isinstance(data["index"], dict):
                data["index"] = _deserialize_index_dict_to_fi_index(data["index"])

            if "columns" in data.keys() and isinstance(data["columns"], dict):
                data["columns"] = _deserialize_index_dict_to_fi_index(data["columns"])

        return data

The Write and Read Functions

These are what are exposed to the user, to roundtrip write and read dataframes.

df_file_interchange.file.rw.write_df_to_file(df: pd.DataFrame, datafile: Path | str, metafile: Path | str | None = None, file_format: FIFileFormatEnum | Literal['csv', 'parquet'] | None = None, encoding: FIEncoding | None = None, custom_info: FIBaseCustomInfo | dict = {}, preprocess_inplace=True) -> Path

Writes a dataframe to file

Parameters:

Name Type Description Default
df DataFrame

The dataframe to save.

required
datafile Path or str

The datafile to save the dataframe to.

required
metafile Path or str or None(optional)

Metafile name, can be only the filename or with a path (which must be the same as for datafile). If not supplied or None, will be determined automatically.

None
file_format FIFileFormatEnum | Literal['csv', 'parquet'] | None

The file format. If not supplied will be determined automatically.

None
encoding FIEncoding | None

Datafile encoding options.

None
custom_info FIBaseCustomInfo or dict

Custom user metadata to be stored. If supplied as a FIBaseCustomInfo (or descendent) then it stores things properly. If supplied as a dict, then will create a FIBaseCustomInfo class and store the dictionary in the unstructured_data field.

{}
preprocess_inplace bool
True

Returns:

Type Description
Path

A Path object with the metainfo filename in it.

Source code in df_file_interchange/file/rw.py
def write_df_to_file(
    df: pd.DataFrame,
    datafile: Path | str,
    metafile: Path | str | None = None,
    file_format: FIFileFormatEnum | Literal["csv", "parquet"] | None = None,
    encoding: FIEncoding | None = None,
    custom_info: FIBaseCustomInfo | dict | None = None,
    preprocess_inplace=True,
) -> Path:
    """Writes a dataframe to file

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to save.
    datafile : Path or str
        The datafile to save the dataframe to.
    metafile : Path or str or None (optional)
        Metafile name, can be only the filename or with a path (which must be
        the same as for datafile). If not supplied or None, will be determined
        automatically.
    file_format : FIFileFormatEnum | Literal['csv', 'parquet'] | None
        The file format. If not supplied will be determined automatically.
    encoding : FIEncoding | None, optional
        Datafile encoding options.
    custom_info : FIBaseCustomInfo or dict or None
        Custom user metadata to be stored. If supplied as a FIBaseCustomInfo
        (or descendent) then it stores things properly. If supplied as a dict,
        then will create a FIBaseCustomInfo class and store the dictionary in
        the `unstructured_data` field. None (the default) is treated as an
        empty dict.
    preprocess_inplace : bool, optional
        Whether to do preprocessing inplace (might modify original), by
        default True.

    Returns
    -------
    Path
        A Path object with the metainfo filename in it.

    """

    # Check types and existence correct for datafile and metafile
    if not isinstance(datafile, (Path, str)):
        error_msg = f"datafile must be a Path or str. Got type={type(datafile)}, value={safe_str_output(datafile)}"
        logger.error(error_msg)
        raise TypeError(error_msg)

    if metafile is not None and not isinstance(metafile, (Path, str)):
        error_msg = "When metafile given (not None), it must be a Path or str."
        logger.error(error_msg)
        raise TypeError(error_msg)

    # Normalise datafile and metafile to Path objects
    if isinstance(datafile, str):
        datafile = Path(datafile)

    if isinstance(metafile, str):
        metafile = Path(metafile)

    # Determine output format
    if file_format is None:
        loc_file_format = _detect_file_format_from_filename(datafile)
    else:
        loc_file_format = FIFileFormatEnum(file_format)

    # Determine metafile name
    loc_metafile = _check_metafile_name(datafile, metafile)

    # If we've got encoding parameters, use them; otherwise use defaults
    if encoding is None:
        encoding = FIEncoding()

    # Preprocess
    if preprocess_inplace:
        _preprocess_inplace(df, encoding)
        loc_df = df
    else:
        loc_df = _preprocess_safe(df, encoding)

    # Deal with custom info situation. A None default (instead of a mutable {}
    # default argument) avoids the shared-mutable-default pitfall; behaviour
    # for callers is unchanged.
    if custom_info is None:
        custom_info = {}
    if isinstance(custom_info, dict):
        # We create FIBaseCustomInfo ourselves, and assign the dictionary into
        # unstructured_data
        loc_custom_info = FIBaseCustomInfo.model_validate(
            {"unstructured_data": custom_info}
        )
    elif isinstance(custom_info, FIBaseCustomInfo):
        loc_custom_info = custom_info
    else:
        # Log before raising, for consistency with the other error paths
        error_msg = "custom_info must be a dict or descendent of FIBaseCustomInfo"
        logger.error(error_msg)
        raise TypeError(error_msg)

    # Write to the data file
    if loc_file_format == FIFileFormatEnum.csv:
        _write_to_csv(loc_df, datafile, encoding)
    elif loc_file_format == FIFileFormatEnum.parquet:
        _write_to_parquet(loc_df, datafile, encoding)
    else:
        error_msg = "Output format not supported. This shouldn't happen."
        logger.error(error_msg)
        raise ValueError(error_msg)

    # Calculate the file's SHA256 hash (named to avoid shadowing builtin hash)
    with open(datafile, "rb") as h_datafile:
        digest = hashlib.file_digest(h_datafile, "sha256")
    datafile_hash = digest.hexdigest()

    # Compile all the metainfo into a dictionary
    metainfo = _compile_metainfo(
        datafile=datafile,
        file_format=loc_file_format,
        hash=datafile_hash,
        encoding=encoding,
        custom_info=loc_custom_info,
        df=loc_df,
    )

    # Write metafile
    _write_metafile(datafile, loc_metafile, metainfo)

    return loc_metafile

df_file_interchange.file.rw.write_df_to_csv(df: pd.DataFrame, datafile: Path | str, encoding: FIEncoding | None = None, custom_info: FIBaseCustomInfo | dict = {}, preprocess_inplace=True) -> Path

Simplified wrapper around write_df_to_file() to write dataframe to CSV

Parameters:

Name Type Description Default
df DataFrame

The dataframe.

required
datafile Path or str

Target datafile.

required
encoding FIEncoding | None

Encoding specs, can be left None for defaults.

None
custom_info dict

Any custom meta data.

{}
preprocess_inplace bool

Whether to do preprocessing inplace (might modify original), by default True

True

Returns:

Type Description
Path

A Path object with the metainfo filename in it.

Source code in df_file_interchange/file/rw.py
def write_df_to_csv(
    df: pd.DataFrame,
    datafile: Path | str,
    encoding: FIEncoding | None = None,
    custom_info: FIBaseCustomInfo | dict = {},
    preprocess_inplace=True,
) -> Path:
    """Convenience wrapper that saves a dataframe as CSV via `write_df_to_file()`

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe.
    datafile : Path or str
        Target datafile.
    encoding : FIEncoding | None, optional
        Encoding specs, can be left None for defaults.
    custom_info : dict, optional
        Any custom meta data.
    preprocess_inplace : bool, optional
        Whether to do preprocessing inplace (might modify original), by default True

    Returns
    -------
    Path
        A Path object with the metainfo filename in it.
    """

    # Delegate to the general writer: pin the format to CSV and let the
    # metafile name be derived automatically (metafile=None).
    writer_args = dict(
        df=df,
        datafile=datafile,
        metafile=None,
        file_format=FIFileFormatEnum.csv,
        encoding=encoding,
        custom_info=custom_info,
        preprocess_inplace=preprocess_inplace,
    )
    return write_df_to_file(**writer_args)

df_file_interchange.file.rw.read_df(metafile: Path | str, strict_hash_check: bool = True, context_metainfo: dict | None = None) -> tuple[pd.DataFrame, FIMetainfo]

Load a dataframe from file

Supply the metainfo filename, not the datafilename.

Parameters:

Name Type Description Default
metafile Path

The YAML file that is associated with the datafile.

required
strict_hash_check bool

Whether we raise an exception if the hash is wrong.

True
context_metainfo dict | None

If manually supplying a context to decode the structured custom info, by default None (in which case subclass type checks are used).

None

Returns:

Type Description
tuple[pd.DataFrame, FIMetainfo]:

A tuple with the dataframe and the metainfo object.

Source code in df_file_interchange/file/rw.py
def read_df(
    metafile: Path | str,
    strict_hash_check: bool = True,
    context_metainfo: dict | None = None,
) -> tuple[pd.DataFrame, FIMetainfo]:
    """Load a dataframe from file

    Supply the metainfo filename, not the datafilename.

    Parameters
    ----------
    metafile : Path
        The YAML file that is associated with the datafile.
    strict_hash_check : bool, optional
        Whether we raise an exception if the hash is wrong.
    context_metainfo : dict | None, optional
        If manually supplying a context to decode the structured custom info, by
        default None (in which case subclass type checks are used).

    Returns
    -------
    tuple[pd.DataFrame, FIMetainfo]:
        A tuple with the dataframe and the metainfo object.
    """

    # Check metafile not empty and correct types
    if not isinstance(metafile, (Path, str)):
        error_msg = f"metafile must be a Path or str. Got type={type(metafile)}, value={safe_str_output(metafile)}"
        logger.error(error_msg)
        raise TypeError(error_msg)

    if isinstance(metafile, str):
        metafile = Path(metafile)

    # Load metainfo
    metainfo = _read_metafile(metafile, context=context_metainfo)

    # Check datafile's hash. The datafile path is resolved relative to the
    # metafile's directory (metainfo.datafile is a bare filename).
    datafile_abs = Path(metafile.parent / metainfo.datafile).resolve()
    with open(datafile_abs, "rb") as h_datafile:
        digest = hashlib.file_digest(h_datafile, "sha256")
    datafile_hash = digest.hexdigest()
    if datafile_hash != metainfo.hash:
        error_msg = f"Hash comparison failed. metainfo.hash={safe_str_output(metainfo.hash)}, calculated hash={safe_str_output(datafile_hash)}."
        if strict_hash_check:
            logger.error(error_msg)
            raise ValueError(error_msg)
        else:
            # Non-strict mode: warn and continue loading anyway
            logger.warning(error_msg)

    # Need to know number of columns (a MultiIndex occupies one CSV column per
    # level; a plain index occupies one)
    if isinstance(metainfo.index, FIMultiIndex):
        num_index_cols = len(metainfo.index.levels)
    else:
        num_index_cols = 1

    if isinstance(metainfo.columns, FIMultiIndex):
        num_index_rows = len(metainfo.columns.levels)
    else:
        num_index_rows = 1

    # Load the data
    if metainfo.file_format == FIFileFormatEnum.csv:
        df = _read_from_csv(
            datafile_abs,
            metainfo.encoding,
            dtypes=metainfo.serialized_dtypes,
            num_index_cols=num_index_cols,
            num_index_rows=num_index_rows,
        )
    elif metainfo.file_format == FIFileFormatEnum.parquet:
        df = _read_from_parquet(datafile_abs, metainfo.encoding)
    else:
        error_msg = f"Input format ({safe_str_output(metainfo.file_format)}) not supported. We only support CSV and Parquet."
        logger.error(error_msg)
        raise ValueError(error_msg)

    # Apply index and columns
    df.index = metainfo.index.get_as_index()
    df.columns = metainfo.columns.get_as_index()

    # Apply dtypes
    _apply_serialized_dtypes(df, metainfo.serialized_dtypes)

    return (df, metainfo)