-
Notifications
You must be signed in to change notification settings - Fork 0
/
summary.py
52 lines (40 loc) · 1.75 KB
/
summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import numpy as np
# Load the pickled frame from the hard-coded local path and report its
# (rows, columns) shape before any cleaning is applied.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data = pd.read_pickle(r"C:\Users\rajneesh.jha\Downloads\deduction\GeorgiaPac.pkl")
print(data.shape)

# Normalise every missing-value marker to np.nan, in this order:
#   1. the literal two-character string \N,
#   2. None / existing nulls (via fillna),
#   3. strings that are empty or contain only whitespace.
data = (
    data.replace('\\N', np.nan)
    .fillna(value=np.nan)
    .replace(r'^\s*$', np.nan, regex=True)
)

# `df` is a second name for the same cleaned frame (no copy is made).
df = data
def _percentage(part, total):
    """Return ``part / total`` as a percentage rounded to two decimals."""
    # An empty frame has no meaningful share; report NaN instead of
    # dividing by zero (which would only emit a runtime warning).
    if total == 0:
        return float('nan')
    # Scale first, round second, so two decimals of the percentage survive
    # and no float artefacts (e.g. 28.999999999999996) leak into the text.
    return float(np.round((part / total) * 100, 2))


def get_summary(column_list, frame=None, top_n=5):
    """Build a per-column profile of a DataFrame.

    For every label in ``column_list`` the result holds the share of null
    values, the number of distinct non-null values, and the ``top_n`` most
    frequent values (nulls included) with their counts and share of rows.

    Parameters
    ----------
    column_list : list
        Column labels to profile; one output row per entry, in order.
    frame : pandas.DataFrame, optional
        Frame to profile. When omitted, the module-level ``df`` is used,
        which is what this function read before the parameter existed.
    top_n : int, optional
        How many of the most frequent values to list per column (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``column_name``, ``null_percentage`` (text such as
        ``'12.5 % '``), ``no_of_uniques`` and ``value_counts`` (entries of
        the form ``'value --> count --> percent % '`` joined by ``' || '``).
    """
    if frame is None:
        frame = df
    total_rows = len(frame)

    records = []
    for col in column_list:
        series = frame[col]

        # Iterate the value_counts Series directly instead of reshaping it
        # into a helper frame: that reshaping broke for a column that is
        # itself named 'count' (duplicate / clashing labels).
        top_values = series.value_counts(dropna=False).head(top_n)
        entries = [
            str(value) + ' --> ' + str(count) + ' --> '
            + str(_percentage(count, total_rows)) + ' % '
            for value, count in top_values.items()
        ]

        records.append({
            'column_name': col,
            'null_percentage': str(
                _percentage(series.isnull().sum(), total_rows)) + ' % ',
            'no_of_uniques': series.nunique(),
            # join() places the separator only between entries, so no
            # trailing separator remnant has to be sliced off afterwards.
            'value_counts': ' || '.join(entries),
        })

    # dtype=object keeps mixed text/int cells as plain Python objects, as the
    # previous cell-by-cell construction produced.
    return pd.DataFrame(
        records,
        columns=['column_name', 'null_percentage', 'no_of_uniques', 'value_counts'],
        dtype=object,
    )
# Profile every column of the cleaned frame and print the resulting table.
summary = get_summary(list(data.columns))
print(summary)