Notebook: Simple Write and Read

import pandas as pd
import numpy as np
import df_file_interchange as fi
from pathlib import Path
# Create a simple dataframe

df = pd.DataFrame(
    {
        "a": [1, 2, 3, 4, 5],
        "b": ["apples", "pears", "oranges", "bananas", "bears"],
        "c": [np.pi, 2*np.pi, 3*np.pi, 4*np.pi, 5*np.pi],
        "d": [
            np.datetime64("2010-01-31T10:23:01"),
            np.datetime64("2014-01-01T10:23:01"),
            np.datetime64("2018-02-28T10:23:01"),
            np.datetime64("2024-01-31T10:23:01"),
            np.datetime64("1999-01-31T23:59:59")]
    },
    index=pd.RangeIndex(start=10, stop=15, step=1),
)
df
    a        b          c                    d
10  1   apples   3.141593  2010-01-31 10:23:01
11  2    pears   6.283185  2014-01-01 10:23:01
12  3  oranges   9.424778  2018-02-28 10:23:01
13  4  bananas  12.566371  2024-01-31 10:23:01
14  5    bears  15.707963  1999-01-31 23:59:59
data_dir = Path("./data/")
data_dir.mkdir(exist_ok=True)
datafile_csv_path = data_dir / "tutorial_trying_out_a_save.csv"

# Write to a CSV file (file format determined by extension of datafile_csv_path)
metafile = fi.write_df_to_file(df, datafile_csv_path)
metafile
PosixPath('data/tutorial_trying_out_a_save.yaml')
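# A quick sanity check (a sketch, relying on the naming visible above): the
# metafile is written alongside the datafile, with the same stem and a .yaml
# extension.
assert metafile == datafile_csv_path.with_suffix(".yaml")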
# Read the dataframe back into df_reload along with metainfo in metainfo_reload
(df_reload, metainfo_reload) = fi.read_df(metafile)
df_reload
    a        b          c                    d
10  1   apples   3.141593  2010-01-31 10:23:01
11  2    pears   6.283185  2014-01-01 10:23:01
12  3  oranges   9.424778  2018-02-28 10:23:01
13  4  bananas  12.566371  2024-01-31 10:23:01
14  5    bears  15.707963  1999-01-31 23:59:59
# The metainfo is returned as an FIMetainfo object, whose attributes are
# themselves structured objects (encoding, index, columns, and so on).
metainfo_reload
FIMetainfo(datafile=PosixPath('tutorial_trying_out_a_save.csv'), file_format=<FIFileFormatEnum.csv: 'csv'>, format_version=1, hash='980eae93340cbcef0d111da0b439a5f8b58f64cf6ab6f923ecb3ce0e0da84e18', encoding=FIEncoding(csv=FIEncodingCSV(csv_allowed_na=['<NA>'], sep=',', na_rep='<NA>', keep_default_na=False, doublequote=True, quoting=2, float_precision='round_trip'), parq=FIEncodingParquet(engine='pyarrow', index=None), auto_convert_int_to_intna=True), custom_info=FIBaseCustomInfo(unstructured_data={}, classname='FIBaseCustomInfo'), serialized_dtypes={'a': {'dtype_str': 'int64', 'serialized_col_name': {'el': 'a', 'eltype': 'str'}}, 'b': {'dtype_str': 'object', 'serialized_col_name': {'el': 'b', 'eltype': 'str'}}, 'c': {'dtype_str': 'float64', 'serialized_col_name': {'el': 'c', 'eltype': 'str'}}, 'd': {'dtype_str': 'datetime64[ns]', 'serialized_col_name': {'el': 'd', 'eltype': 'str'}}}, index=FIRangeIndex(start=10, stop=15, step=1, name=None, dtype='int64', index_type='range'), columns=FIIndex(data=['a', 'b', 'c', 'd'], name=None, dtype='object', index_type='idx'))
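# The fields shown in the repr are ordinary attributes on the FIMetainfo
# object (a short illustration; attribute names are taken from the repr above).
print(metainfo_reload.file_format)   # FIFileFormatEnum.csv
print(metainfo_reload.hash)          # hash of the datafile recorded at write time
print(metainfo_reload.index.start, metainfo_reload.index.stop)   # 10 15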
# Let's have a quick look at what the YAML file contains (we'll come back to
# this in a different tutorial)
with open(metafile, 'r') as h_file:
    print(h_file.read())
# Metadata for <function safe_str_output at 0x7f5d696004a0>
---

columns:
  data:
    el:
    - el: a
      eltype: str
    - el: b
      eltype: str
    - el: c
      eltype: str
    - el: d
      eltype: str
    eltype: list
  dtype: object
  index_type: idx
  name: null
custom_info:
  classname: FIBaseCustomInfo
  unstructured_data: {}
datafile: tutorial_trying_out_a_save.csv
encoding:
  auto_convert_int_to_intna: true
  csv:
    csv_allowed_na:
    - <NA>
    doublequote: true
    float_precision: round_trip
    keep_default_na: false
    na_rep: <NA>
    quoting: 2
    sep: ','
  parq:
    engine: pyarrow
    index: null
file_format: csv
format_version: 1
hash: 980eae93340cbcef0d111da0b439a5f8b58f64cf6ab6f923ecb3ce0e0da84e18
index:
  dtype: int64
  index_type: range
  name: null
  start: 10
  step: 1
  stop: 15
serialized_dtypes:
  a:
    dtype_str: int64
    serialized_col_name:
      el: a
      eltype: str
  b:
    dtype_str: object
    serialized_col_name:
      el: b
      eltype: str
  c:
    dtype_str: float64
    serialized_col_name:
      el: c
      eltype: str
  d:
    dtype_str: datetime64[ns]
    serialized_col_name:
      el: d
      eltype: str


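# Since the metafile is plain YAML, you can also inspect it without
# df_file_interchange (a sketch using PyYAML, which is an extra dependency
# here; field names are taken from the dump above).
import yaml

with open(metafile, "r") as h_file:
    raw_meta = yaml.safe_load(h_file)
print(raw_meta["file_format"])   # csv
print(raw_meta["index"]["start"], raw_meta["index"]["stop"])   # 10 15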
# Now we check that the original dataframe, df, and the df_reload read back
# from disk are the same. We use chk_strict_frames_eq_ignore_nan() because, in
# this context, we want NaN == NaN (ordinarily NaN != NaN)
fi.chk_strict_frames_eq_ignore_nan(df, df_reload)
True
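# Why the "ignore_nan" variant matters: IEEE 754 equality makes NaN unequal to
# itself, so a naive element-wise comparison can report False for matching
# frames that contain NaN (not an issue for this df, but it is in general).
print(np.nan == np.nan)                # False
df_nan = pd.DataFrame({"x": [1.0, np.nan]})
print((df_nan == df_nan).all().all())  # False, even comparing a frame to itself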
# There are convenience functions to write CSV or Parquet explicitly
datafile_parq_path = data_dir / "tutorial_trying_out_a_save.parq"
fi.write_df_to_parquet(df, datafile_parq_path)
PosixPath('data/tutorial_trying_out_a_save.yaml')
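# The CSV counterpart follows the same pattern (function name assumed by
# analogy with write_df_to_parquet; check the package docs if it differs).
fi.write_df_to_csv(df, datafile_csv_path)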
# The metafile path can be specified explicitly when writing, but it must be
# in the same directory as the data file. The output format can also be given
# explicitly.
fi.write_df_to_file(
    df,
    datafile_csv_path,
    data_dir / "tutorial_trying_out_a_save_diff_metafile.yaml",
    file_format="csv",
)
PosixPath('data/tutorial_trying_out_a_save_diff_metafile.yaml')
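# Reading back via the explicitly named metafile works just as before (a quick
# check reusing read_df and the comparison helper from above).
(df_reload2, _) = fi.read_df(data_dir / "tutorial_trying_out_a_save_diff_metafile.yaml")
fi.chk_strict_frames_eq_ignore_nan(df, df_reload2)   # expect True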
# Additional encoding options can be supplied, but this is almost never a good
# idea (the defaults were chosen carefully)
encoding_csv = fi.file.rw.FIEncodingCSV(sep=";")
encoding = fi.file.rw.FIEncoding(csv=encoding_csv)
metafile_new_sep = fi.write_df_to_file(df, data_dir / "tutorial_trying_out_a_save_new_sep.csv", encoding=encoding)
(df_new_sep, metainfo_new_sep) = fi.read_df(metafile_new_sep)
fi.chk_strict_frames_eq_ignore_nan(df, df_new_sep)
True
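# To confirm the custom separator took effect, peek at the raw CSV header (a
# sketch; the exact header layout depends on how the index is serialized).
with open(data_dir / "tutorial_trying_out_a_save_new_sep.csv", "r") as h_file:
    print(h_file.readline().rstrip())   # expect ';'-separated column names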