-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sub_station_clustering.R
322 lines (239 loc) · 12.3 KB
/
Sub_station_clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
require(ggplot2)
require(tidyverse)
require(cluster)
require(factoextra)
require(rdist)
require(gridExtra)
#import data
load("January_2013.RData")
load("January_2014.RData")
load("January_2015.RData")
Characteristics <- read.csv("Characteristics.csv", stringsAsFactors = FALSE)
#QUESTION 1: Initial Data Analysis Tasks
#1.a: Summarise the data in the CHaracteristics dataset and plot the distributions
summary(Characteristics)
#Transformer_RATING - size of total power being delivered
#Percentage_IC - percentage of industrial and commercial customers
#TRANSFORMER_TYPE - ground or pole mounted
library(hrbrthemes)
p<- Characteristics %>%
ggplot(aes(x=Percentage_IC))+
geom_histogram(fill = '#69b3a2', color = "#e9ecef", alpha = 0.9)+
ggtitle("Distribution of industrial and commercial customers")+
theme_ipsum()+
theme(plot.title = element_text(size=12))+
xlab("Percentage of industrial and commercial customers")+
ylab("Frequency of customers")
p
p1<- ggplot(data = Characteristics)+
geom_bar(mapping= aes(x=TRANSFORMER_TYPE), fill = '#69b3a2',
color = "#e9ecef", alpha = 0.9)+
ggtitle("Type of Transformer")+
theme_ipsum()+
theme(plot.title = element_text(size=12))+
xlab("")+
ylab("Frequency")
p1
p2<- Characteristics %>%
ggplot(aes(x=Transformer_RATING))+
geom_histogram(binwidth = 150, fill = '#69b3a2', color = "#e9ecef", alpha = 0.9)+
ggtitle("Distribution of total power delivered")+
theme_ipsum()+
theme(plot.title = element_text(size=12))+
xlab("Transformer Rating")+
ylab("Frequency")
p2
p3<- Characteristics %>%
ggplot(aes(x=TOTAL_CUSTOMERS))+
geom_histogram(fill = '#69b3a2', color = "#e9ecef", alpha = 0.9)+
ggtitle("Number of customers served")+
theme_ipsum()+
theme(plot.title = element_text(size=12))+
xlab("Customers served")+
ylab("Frequency")
p3
library(gridExtra)
grid.arrange(p, p1, p2, p3, ncol=2)
## Qu 1 ii) Describe the relationships between the different substation characteristics
library(GGally)
pairplot <- ggpairs(Characteristics[,2:6], title="Correlogram of substation characteristics")
corrviz <- ggcorr(Characteristics[,3:6], method = c('everything', 'pearson'))
#only significant correlation is between the Total Customers and Feeder Count....
##QUESTION 2: Using scaled daily measurements from Jan 2013 dataset perform hierarchical clustering for the daily average demand
##Qu 2i)
#1. scale daily measurements - actual measurement divided by the daily max
#remove na's
January_2013 <- na.omit(January_2013)
#as dataframe
January_2013 <- as.data.frame(January_2013)
#remove the label data and convert to matrix ready for scaling
df_13 <- data.matrix(January_2013[,3:146])
#create daily maxs
daily_maximums <- apply(df_13,1,max)
#scale by daily max
scaled_data <- t(scale(t(df_13), center = FALSE, scale = daily_maximums))
scaled_data2 <- df_13/matrix(daily_maximums, ncol = ncol(df_13), nrow = nrow(df_13), byrow = FALSE)
#check both methods give the same result
sum(scaled_data - scaled_data2)
####################################################################
#TIDY, MERGE and AGGREGATE FOR MEAN DAILY PROFILES PER SUBSTATION
#add substation name and date back with an index to merge on
January_2013$index <- 1:nrow(January_2013)
scaled_data2 <- as.data.frame(scaled_data2)
scaled_data2$index <- 1:nrow(scaled_data2)
#move index column of scaled data2 to beginning to subset and merge
scaled_data2 <- scaled_data2 %>% select(index, everything())
January_2013 <- January_2013 %>% select(index, everything())
#merge scaled data with substation and date labels by index column
January_2013_scaled <- merge(scaled_data2, January_2013[,c(1:3)], by='index')
January_2013_scaled <- January_2013_scaled %>% select(Date, Substation, everything())
#aggregate and average by substation
January_2013_s_stav <- aggregate(January_2013_scaled[,4:147], by = list(January_2013_scaled$Substation), FUN = mean, na.rm =TRUE)
#rename column
colnames(January_2013_s_stav)[colnames(January_2013_s_stav) == 'Group.1'] <- 'Substation'
###################################OLD CODE ####################################
#find daily average scaled demands
#create a distance matrix - just on the demand numbers not inlucluding substation
dm<- dist(as.matrix(January_2013_s_stav[,2:145]))
#prepare hierarchical cluster
hc = hclust(dm)
#dendogram
plot(hc, hang = -1) #hang place the labels all at the same level
#cut the dendrogram to give 9 clusters
clusters9 <-cutree(hc, 9)
#view cluster allocation
table(January_2013_s_stav$Substation, clusters9) #crosstab of 9 cluster cut by Substation
table(clusters9)
#QUESTION 2.iii) a) For each cluster (facet) plot the demand for 1) all days, 2) weekdays 3) saturdays 4) sundays
#recombine the cluster labels with the original dataset
January_2013_stav_clusters <- cbind(January_2013_s_stav, clusters9)
#move cluster label to beginning
January_2013_stav_clusters <- January_2013_stav_clusters %>% select(clusters9, everything())
#make data long to be able to plot
January_2013_stav_clusters_long <- reshape(January_2013_stav_clusters, direction = 'long',
varying = list(names(January_2013_stav_clusters)[3:146]), v.names = 'demand')
#FINAL PLOT
p_alldays <- ggplot(January_2013_stav_clusters_long, aes(x = time, y = demand, colour = factor(clusters9)))+
geom_smooth()+
ggtitle("Daily average demand by cluster group")+
facet_wrap(~clusters9)
#See how the points vary
p_alldays2 <- ggplot(January_2013_stav_clusters_long, aes(x = time, y = demand, colour = factor(clusters9)))+
geom_point()+
ggtitle("Daily average demand by cluster group")+
facet_wrap(~clusters9)
p_alldays2
##APPLY SPline smoothing with df=48 so smoothed by every half hour
library(splines2)
theme_set(theme_minimal())
p_alldays3 <- ggplot(January_2013_stav_clusters_long, aes(x = time, y = demand, colour = factor(clusters9)))+
geom_smooth(method= lm, formula = y~ splines2::bSpline(x, df =48))+
ggtitle("Daily average demand by cluster group")+
facet_wrap(~clusters9)
p_alldays3+theme(legend.position = 'none')
#########################################################################
##QU 2 iii) Plot weekdays, saturdays and sundays
#rename column
#colnames(January_2013_s_stav)[colnames(January_2013_s_stav) == 'Group.1'] <- 'Substation'
#merge data with cluster labels back onto original dataset to have date-substation pairs
Jan13_clust_date <- merge(January_2013_scaled, January_2013_stav_clusters[c("Substation","clusters9")], by="Substation", all=FALSE)
#move cluster label to beginning
Jan13_clust_date <- Jan13_clust_date %>% select(clusters9, everything())
#make data long to be able to plot
Jan13_clust_date_long <- reshape(Jan13_clust_date, direction = 'long',
varying = list(names(Jan13_clust_date)[5:148]), v.names = 'demand')
#create day of the week column
Jan13_clust_date_long$day_of_week <- weekdays(Jan13_clust_date_long$Date)
Jan13_clust_date_long$day_of_week[Jan13_clust_date_long$day_of_week %in% c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')] <- 'Weekday'
theme_set(theme_minimal())
p_dayofweeks <- ggplot(Jan13_clust_date_long, aes(x = time, y = demand, group = day_of_week, colour = day_of_week))+
geom_smooth(method= lm, formula = y~ splines2::bSpline(x, df =48))+
ggtitle("Daily average demand by cluster group for different days of week")+
facet_wrap(~clusters9)
p_dayofweeks
ob#################################################################################
## QU 2.4
#produce summaries of the variables in characteristics and describe clusters
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==1))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==2))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==3))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==4))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==5))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==6))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==7))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==8))
summary(subset(Characteristics, Jan13_clust_date_long$clusters9==9))
#find mean values for each cluster
Characteristics2 <- Characteristics %>% rename(Substation = SUBSTATION_NUMBER)
final_data <- merge(January_2013_stav_clusters[c("Substation", "clusters9")], Characteristics2, by = "Substation", all=FALSE)
aggregate(final_data[,4:7], by=list(clusters9=final_data$clusters9), mean)
################################################################################
##QU 3 Allocate New Substations
# qu3 i) plot for each new substation all days, weekdays, saturdays and sundays
#scale the daily measurements
Newsubstations <- read.csv("NewSubstations.csv", stringsAsFactors = FALSE)
Newsubstations <- as.data.frame(Newsubstations)
Newsub_scale <- Newsubstations[,3:146]/apply(Newsubstations[,3:146],1,max)
Newsub_scale <- cbind(Newsubstations[,1:2], Newsub_scale)
#convert date column to date format
library(lubridate)
Newsub_scale$Date <- ymd(Newsub_scale$Date)
Newsub_scale_day <- Newsub_scale
Newsub_scale_day$day_of_week <- weekdays(Newsub_scale$Date)
#reshape to long to plot
Newsub_scale_long <- reshape(Newsub_scale_day, direction = 'long', varying = list(names(Newsub_scale)[3:146]), v.names = 'demand')
Newsub_scale_long$day_of_week[Newsub_scale_long$day_of_week %in% c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')] <- 'Weekday'
#All days
theme_set(theme_minimal())
newss_alldays <-
ggplot(Newsub_scale_long, aes(x = time, y= demand))+
geom_smooth(method = lm, formula = y~splines2::bSpline(x, df = 48))+
ggtitle('Daily demand for newsubstations for all days')+
facet_wrap(~Substation)
newss_alldays
##Day Comparison PLOT
theme_set(theme_minimal())
newss_weekdays <- ggplot(Newsub_scale_long, aes(x=time, y = demand, group = day_of_week, colour = day_of_week))+
geom_smooth(method = lm, formula = y~splines2::bSpline(x, df = 48))+
ggtitle('Daily demand for new substations for different days of the week')+
facet_wrap(~Substation)
newss_weekdays
newss_weekdays +theme(legend.position = 'none')
##qu 3 ii)
library(cluster)
library(factoextra)
library(rdist)
Newsub_scale_daily <- aggregate(Newsub_scale, by = list(Newsub_scale$Substation), FUN = 'mean')
#Do kmeans via cclust for 9 clusters
library(cclust)
set.seed(612)
kmeanscc <- cclust(as.matrix(January_2013_scaled[,4:147]),9,dist="euclidean",verbose=TRUE,method="kmeans")
kmeanscc$cluster
#Allocated new subsatations to a clusters
newcc <- predict(kmeanscc,as.matrix(Newsub_scale_daily[,4:147]))
newcc$cluster
###MERGE THE KMEANS CLUSTERS BACK ONTO JANUARY2013 spreadsheet
#plot the new clusters and compare with qu3 i
#merge cluster numbers back onto datasets
Newsubstations_ccluster <- data.frame(Newsubstations, newcc$cluster)
January_2013_ccluster <- data.frame(January_2013, kmeanscc$cluster)
#plot main dataset to see profiles of clusters and compare with plots from 3i to see how they compare
#cluster column called kmeanscc.cluster / data in col 5:148
#move cluster label to beginning
January_2013_ccluster <- January_2013_ccluster %>% select(kmeanscc.cluster, everything()) %>% rename(c_cluster = kmeanscc.cluster)
#make data long to be able to plot
January_2013_ccluster_long <- reshape(January_2013_ccluster, direction = 'long',
varying = list(names(January_2013_ccluster)[5:148]), v.names = 'demand')
#plot demand profiles
##APPLY SPline smoothing with df=48 so smoothed by every half hour
library(splines2)
theme_set(theme_minimal())
p_cclust_all <- ggplot(January_2013_ccluster_long, aes(x = time, y = demand, colour = c_cluster))+
geom_smooth(method= lm, formula = y~ splines2::bSpline(x, df =48))+
ggtitle("Daily average demand by k-means cluster group")+
facet_wrap(~c_cluster)
p_cclust_all <- p_cclust_all+theme(legend.position = 'none')
#view new substations and clustered substations together to see if allocated to relevant clusters
library(gridExtra)
grid.arrange(newss_weekdays, p_cclust_all, ncol=1, nrow=2)
#looks like a good allocation