Skip to main content

Schema Script

Purpose: This document provides a script for Schema operations that helps users generate the structural representation (schema) needed to configure Tables.
Created: December 11, 2023

from pandas.api.types import is_string_dtype, is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
import pandas as pd
import datetime as dt
import numpy as np
import json
import random

def cast_var_to_string(x):
    """Serialize the nested config fields of each entry in a list to JSON text.

    When ``x`` is a list, every element is expected to be a mapping with
    ``"Configurations"`` and ``"Properties"`` keys; both values are replaced
    in place by their ``json.dumps`` representation. Any other input is
    returned untouched.

    Args:
        x: A list of dict-like entries, or any other value.

    Returns:
        The same object that was passed in (list entries are mutated in
        place, not copied).

    Raises:
        KeyError: If a list entry lacks ``"Configurations"`` or
            ``"Properties"``.
    """
    if isinstance(x, list):
        for item in x:
            # ``item`` is the same object as ``x[idx]``, so assigning through
            # it updates the caller's list entries directly.
            item["Configurations"] = json.dumps(item["Configurations"])
            item["Properties"] = json.dumps(item["Properties"])
    return x


def _base_data_type(column):
    """Return the flat schema type for a column's dtype, or None if none applies."""
    if is_bool_dtype(column):
        return "Boolean"
    if is_datetime64_any_dtype(column):
        return {"Timestamp": ["Nanosecond", "UTC"]}
    if is_numeric_dtype(column):
        return "Float64"
    # ``is_string_dtype`` may reject object columns that mix strings with
    # nulls (stricter in newer pandas); fall back to Utf8 for any object
    # column so the field never ends up without a "data_type" entry.
    if is_string_dtype(column) or column.dtype == object:
        return "Utf8"
    return None


def recurse_cols(df: pd.DataFrame, name1=None, islist=False):
    """Build schema field descriptors for every column of ``df``.

    Each column yields a dict with the keys ``name``, ``nullable``,
    ``dict_id``, ``dict_is_ordered``, ``metadata`` and ``data_type``.
    Flat columns map to ``"Boolean"``, a nanosecond/UTC ``Timestamp``,
    ``"Float64"`` or ``"Utf8"``. A column whose first list-or-dict value is a
    list becomes ``{"List": <field>}`` (described from the exploded
    elements); one whose first such value is a dict becomes
    ``{"Struct": [<field>, ...]}`` (described from the dict keys).

    Args:
        df: Frame whose columns are to be described.
        name1: If given, used as the field name instead of the column label.
        islist: When True, return the descriptor of the first column as a
            single dict instead of a list (used for list element types).

    Returns:
        A list of field dicts, or a single field dict when ``islist`` is True
        and ``df`` has at least one column.
    """
    fields = []
    for item in df.columns:
        column = df[item]
        item_dict = {
            "name": item if name1 is None else name1,
            "nullable": True,
            "dict_id": 0,
            "dict_is_ordered": False,
            "metadata": {},
        }

        data_type = _base_data_type(column)
        if data_type is not None:
            item_dict["data_type"] = data_type

        # Look at the values themselves rather than indexing a null mask by
        # position: the mask lookup was label-based (broken for any index
        # other than 0..n-1) and was recomputed for every row.
        nested = next(
            (val for val in column if isinstance(val, (list, dict))), None
        )

        if isinstance(nested, list):
            # infer_objects() lets exploded numbers/bools/datetimes pick up a
            # real dtype instead of staying ``object``.
            elements = (
                column.explode()
                .reset_index(drop=True)
                .to_frame()
                .infer_objects()
            )
            item_dict["data_type"] = {
                "List": recurse_cols(elements, name1="item", islist=True)
            }
        elif isinstance(nested, dict):
            # json_normalize cannot handle null entries, so only the rows
            # that actually hold a value are passed to it.
            records = column.dropna().tolist()
            item_dict["data_type"] = {
                "Struct": recurse_cols(pd.json_normalize(records, max_level=0))
            }

        if islist:
            return item_dict

        fields.append(item_dict)

    return fields

## DEFINE YOUR DF HERE
# Assign the pandas DataFrame to describe to ``df`` above this point, then
# the schema for its columns is printed to stdout as a single JSON document.
fields = recurse_cols(df)
struct = dict(fields=fields, metadata={})
print(json.dumps(struct))