-
Notifications
You must be signed in to change notification settings - Fork 0
/
callback.py
152 lines (113 loc) · 4.75 KB
/
callback.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import numpy as np
import pandas as pd
import requests
def waterproduced(src):
# Properly ready the data
df = pd.read_excel(src, sheet_name=None, skiprows=2, usecols=(4, 5, 6, 7, 8))
df = df["Planilha2"]
df = df.drop(df.tail(1).index)
# Sistema: SAA Guaratuba
# ETA: Todas
df = df.drop(columns=["Sistema", "ETA", "Dia da Semana"])
df = df.rename(columns={"Produzido (m3)": "water_produced", "Data": "timestamp"})
df.timestamp = pd.to_datetime(df.timestamp, format="%Y-%m-%d %H:%M:%S")
df = df.set_index("timestamp")
dst = os.path.basename(src)
dst = dst.replace(".xlsx", ".csv")
dst = dst.lower().replace(" ", "-")
return df, dst
def meteorological(src):
df = pd.read_excel(src)
df = df.rename(
columns={
"data": "timestamp",
"temperatura": "temperature",
"radiacao": "radiation",
"precipitacao": "precipitation",
"umidade_relativa": "relative_humidity",
}
)
# Feature engineering: create daily features based on the data
# Use df.timestamp.dt.date, otherwise some days without measurement at midnight
# won't be considered: https://github.com/pandas-dev/pandas/issues/16381
df.timestamp = pd.to_datetime(df.timestamp.dt.date, format="%Y-%m-%d %H:%M:%S")
df = df.set_index("timestamp")
new = pd.DataFrame()
# Following dates don't have measurement at 00:00:00, but have enough to resample
# 2017-01-14
# 2017-01-17
# 2018-01-01
# 2018-01-11
# Following dates don't have enough measurements to resample
# 2019-01-05
# 2019-01-06
# 2019-01-07
# 2019-01-08
# 2019-01-09
# TODO: this feature engineering should be here? If inside Source, data should be
# already daily sampled and we wouldn't be able to create the features
new["temperature_mean"] = df.temperature.resample("D").mean()
new["temperature_std"] = df.temperature.resample("D").std()
new["radiation_mean"] = df.radiation.resample("D").mean()
new["radiation_std"] = df.radiation.resample("D").std()
new["relative_humidity_mean"] = df.relative_humidity.resample("D").mean()
new["relative_humidity_std"] = df.relative_humidity.resample("D").std()
new["precipitation_mean"] = df.precipitation.resample("D").mean()
new["precipitation_std"] = df.precipitation.resample("D").std()
new = new.drop(pd.Timestamp("2019-01-09"))
new = new.drop(pd.Timestamp("2016-02-29"))
# Reindex to fill missing dates (it will also remove data outside new idx range)
idx = pd.date_range("2016-01-01", "2019-12-31", freq="D")
new = new.reindex(idx, fill_value=np.nan)
dst = os.path.basename(src)
dst = dst.replace(".xlsx", ".csv")
dst = dst.lower().replace(" ", "-")
return new, dst
# TODO: get from (?)
def holidays(city, state, years=("2016", "2017", "2018", "2019")):
# Holidays data are gathered from a web API. Data from Guaratuba, Curitiba and
# Joinville are gathered, from 2016 to 2019
jsntoken = "amVzdWluby52ZkBnbWFpbC5jb20maGFzaD0xMDMxODQ1MTQ"
dftmp = []
for year in years:
URL = (
f"https://api.calendario.com.br/?json=true&ano="
f"{year}&estado="
f"{state}&cidade="
f"{city}&token={jsntoken}"
)
response = requests.get(URL)
assert response.status_code == 200, f"Reponse code {response.status_code}"
dftmp += response.json()
df = pd.DataFrame(dftmp)
df = df.rename(columns={"date": "timestamp"})
df.timestamp = pd.to_datetime(df.timestamp, format="%d/%m/%Y")
df = df.set_index("timestamp")
df["is_holiday_" + city.lower()] = True
# Drop rows with holiday type = "Dia Convencional"
df = df[df.type != "Dia Convencional"]
# Drop not used columns
df = df.drop(
["link", "description", "type_code", "raw_description", "name", "type"],
axis=1,
)
# Drop duplicated rows (we're only interested if it's holiday or not)
df = df[~df.index.duplicated(keep="first")]
# Reindex to fill missing dates
idx = pd.date_range("2016-01-01", "2019-12-31", freq="D")
df = df.reindex(idx, fill_value=False)
dst = f"holidays-{city.lower()}.csv"
return df, dst
def manual(manualist, column, dst):
# Manual data includes carnival (with Friday, Saturday and Sunday before
# official holidays) and school recess
# Carnival includes Friday, Saturday and Sunday before official holidays
# School recess for the state of Parana
idx = pd.date_range("2016-01-01", "2019-12-31", freq="D")
df = pd.DataFrame({"timestamp": idx})
df = df.set_index("timestamp") # Avoid SettingWithCopyWarning
df[column] = False
ixs = df.index.isin(manualist)
df[column][ixs] = True
return df, dst