
library(jsonlite)
library(dplyr)
library(utils)
require(nCov2019)
# Output directory for the saved data set. It is concatenated directly with
# the file name when saving, so it must end with a slash.
path <- "~/Dropbox/covid-19/R/"

####################################################################
####################### Functions to ease data manipulation ########
####################################################################

# Order the rows of a data frame by province, then by time.
#
# df: data frame with at least the columns `province` and `time`.
# Returns a plain data.frame (grouping dropped) holding the same rows in
# (province, time) order.
rev.time <- function(df) {
  by.province <- group_by(df, province)
  sorted <- arrange(by.province, province, time)
  as.data.frame(sorted)
}

# Collapse all rows of `df` into one series by summing, for each date, the
# cumulative and daily counts of confirmed cases and deaths.
#
# df:   data frame with columns time, country, province, cum_confirm,
#       cum_dead, confirm and dead (the result is re-ordered with names(df),
#       so df must not carry other columns).
# ctry: value written into the `country` column of the result.
# pr:   value written into the `province` column of the result.
# Returns a plain data.frame with one row per date and the same column order
# as `df`.
aggregate.time <- function(df, ctry, pr) {
  # The bare function `sum` replaces the deprecated `funs(sum(.))` wrapper;
  # with a single function the summarised columns keep their original names.
  totals <- df %>%
    group_by(time) %>%
    summarise_at(vars(cum_confirm, cum_dead, confirm, dead), sum) %>%
    mutate(country = ctry, province = pr)
  as.data.frame(totals)[, names(df)]
}

  
# Derive daily counts from cumulative counts, province by province.
#
# df: data frame with columns province, cum_confirm and cum_dead; rows are
#     differenced in their current order within each province, so they are
#     expected to be in chronological order already.
# Returns a plain data.frame with two extra columns, `confirm` and `dead`.
add.daily <- function(df) {
  daily <- df %>%
    group_by(province) %>%
    mutate(confirm = cum_confirm - lag(cum_confirm),
           dead = cum_dead - lag(cum_dead)) %>%
    as.data.frame()

  # Where the difference is undefined (e.g. the first row of a province has
  # no previous value) fall back to the cumulative count itself.
  no.confirm <- is.na(daily$confirm)
  daily$confirm[no.confirm] <- daily$cum_confirm[no.confirm]
  no.dead <- is.na(daily$dead)
  daily$dead[no.dead] <- daily$cum_dead[no.dead]

  daily
}

# Derive cumulative counts from daily counts, province by province.
#
# df: data frame with columns time, country, province, confirm and dead; the
#     running sums follow the current row order within each province.
# Returns a plain data.frame restricted to the seven standard columns, in the
# standard order.
add.cum <- function(df) {
  standard.cols <- c("time", "country", "province",
                     "cum_confirm", "cum_dead", "confirm", "dead")
  running <- mutate(group_by(df, province),
                    cum_confirm = cumsum(confirm),
                    cum_dead = cumsum(dead))
  as.data.frame(running)[, standard.cols]
}


##################################################################

# Threshold on the share of cumulative deaths. Within each data set the units
# (cities, provinces, states, regions) are sorted by increasing cumulative
# deaths; the smallest ones, for as long as their running share of the total
# does not exceed `ratio`, are not kept as individual series.
# With ratio = 0 the selection steps guarded by `ratio > 0` are skipped.
ratio <- 0.1
# What happens to the units that are not kept individually:
#   FALSE = they are aggregated into a single "RestOf..." series
#   TRUE  = they are discarded
skip <- FALSE

#################################################################


########################################################################################
########################### Data from ECDC international data df.glob   ################
########################################################################################


# Download the ECDC case-distribution table. Only the empty string is treated
# as missing (na.strings = ""), so a literal "NA" field stays as text.
data <- read.csv(
  "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv",
  na.strings = "", fileEncoding = "UTF-8-BOM"
)

# Reporting dates are parsed as day/month/four-digit-year strings.
tps <- as.Date(data$dateRep, format = "%d/%m/%Y")

# One row per (date, country) with daily counts; country-level rows carry the
# placeholder province 'none'.
df.glob <- data.frame(time = tps, country = data$countriesAndTerritories,
                      confirm = data$cases, dead = data$deaths)
df.glob <- mutate(df.glob, province = 'none')
df.glob <- df.glob[, c("time", "country", "province", "confirm", "dead")]

# Put the rows in chronological order before accumulating.
df.glob <- rev.time(df.glob)

# Running totals per country, then the standard seven-column layout.
df.glob <- as.data.frame(
  mutate(group_by(df.glob, country),
         cum_confirm = cumsum(confirm),
         cum_dead = cumsum(dead))
)
df.glob <- df.glob[, c("time", "country", "province", "cum_confirm",
                       "cum_dead", "confirm", "dead")]


################################################################
###############   Data for Hubei province df.hubei  ############
################################################################


# Load the nCov2019 data. `nc` is not referenced again in the visible script;
# `big.df` is indexed below by "city", "province" and "global".
nc <- get_nCov2019()
big.df <- load_nCov2019(lang = 'en', source = 'github')

#################  Skip small cities

# City-level series inside Hubei; the city name is stored in the `province`
# column so that the shared helpers can be reused.
df.insidehubei <- filter(big.df["city"], province == "Hubei")
df.insidehubei <- df.insidehubei[, c("time", "country", "city",
                                     "cum_confirm", "cum_dead")]
names(df.insidehubei)[3] <- "province"
df.insidehubei <- add.daily(filter(df.insidehubei, !is.na(province)))

if (ratio > 0) {
  # Reference date: two days before the latest date in the data.
  # NOTE(review): presumably chosen to avoid incomplete recent reports - confirm.
  end <- rev(unique(sort(df.insidehubei$time)))[1]
  tmp <- arrange(filter(df.insidehubei, time == end - 2), cum_dead)
  # Cities sorted by increasing deaths: keep those reached once the running
  # share of deaths exceeds `ratio`. The selection is used as-is (no c()
  # wrapper), like in the other sections, so a factor column is never reduced
  # to its integer codes before the %in% test.
  cities.keep <- tmp$province[cumsum(tmp$cum_dead) / sum(tmp$cum_dead) > ratio]
  # Small cities are dropped here whatever the value of `skip`.
  df.insidehubei <- filter(df.insidehubei, province %in% cities.keep)
}


################  Aggregation of other provinces

# Province-level series with daily counts added.
df.prov <- filter(big.df["province"])
df.prov <- df.prov[, c("time", "country", "province", "cum_confirm", "cum_dead")]
df.prov <- add.daily(filter(df.prov, !is.na(province)))

# Hubei is kept as its own series.
df.hubei <- filter(df.prov, province == 'Hubei')

# All other provinces are either dropped (skip) or summed into one
# "RestOfProvinces" series.
df.nothubei <- if (skip) {
  NULL
} else {
  aggregate.time(filter(df.prov, province != 'Hubei'),
                 "China", "RestOfProvinces")
}

########## Final step

# Stack the Hubei cities, the Hubei province series and the aggregate of the
# other provinces (NULL when skipped).
df.hubei <- rbind(df.insidehubei, df.hubei, df.nothubei)

# Country-level series for China, stored with the placeholder province "none".
df.china <- filter(big.df["global"], country == "China")
df.china <- mutate(df.china, province = "none")
df.china <- df.china[, c("time", "country", "province", "cum_confirm", "cum_dead")]
df.china <- add.daily(df.china)
df.hubei <- rbind(df.hubei, df.china)

# Remove rows whose daily counts are negative. The removal is guarded because
# indexing with -integer(0) selects no rows at all, so the original
# df[-which(cond), ] returned an empty data frame when nothing was negative.
# Rows where the comparison is NA are kept, as which() ignores them.
negative.rows <- with(df.hubei, which(dead < 0 | confirm < 0))
if (length(negative.rows) > 0) {
  df.hubei <- df.hubei[-negative.rows, ]
}


#####################################################################################
######################### Data for Canada df.can     ################################
#####################################################################################

################ Provinces Canada

# Download the Canadian table and map its columns onto the standard layout:
# date -> time, prname -> province, numtotal -> cum_confirm,
# numdeaths -> cum_dead.
df.can <- read.csv(url("https://sante-infobase.canada.ca/src/data/covidLive/covid19.csv"))
df.can$country <- rep("Canada", nrow(df.can))
df.can <- df.can[, c("date", "country", "prname", "numtotal", "numdeaths")]
df.can$prname <- as.character(df.can$prname)
names(df.can) <- c("time", "country", "province", "cum_confirm", "cum_dead")

# Missing cumulative counts are treated as zero.
df.can$cum_confirm[is.na(df.can$cum_confirm)] <- 0
df.can$cum_dead[is.na(df.can$cum_dead)] <- 0

# Dates are day-month-year. "%y" reads only two year digits and the rest of
# the string is ignored, so a four-digit year such as 2021 would be read as
# 2020. Strings with a four-digit year are therefore re-parsed with "%Y";
# two-digit-year strings are parsed exactly as before.
tmp <- as.character(df.can$time)
df.can$time <- as.Date(tmp, format = "%d-%m-%y")
four.digit.year <- grepl("^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$", tmp)
df.can$time[four.digit.year] <- as.Date(tmp[four.digit.year],
                                        format = "%d-%m-%Y")

# The row labelled "Canada" is handled separately from the provinces.
df.onlyCan <- add.daily(filter(df.can, province == "Canada"))
df.can <- add.daily(filter(df.can, province != "Canada"))

if (ratio > 0) {
  # Reference date: two days before the latest date in the data.
  end <- rev(sort(unique(df.can$time)))[1]
  tmp <- na.omit(arrange(filter(df.can, time == end - 2), cum_dead))
  # D is also read later by the Quebec regions section, so it must stay a
  # top-level variable with this name.
  D <- sum(tmp$cum_dead)
  # Provinces sorted by increasing deaths: keep those reached once the
  # running share of deaths exceeds `ratio`.
  prov.keep <- tmp$province[cumsum(tmp$cum_dead) / D > ratio]
  tmp.keep <- filter(df.can, province %in% prov.keep)
  if (skip) {
    tmp.agg <- NULL
  } else {
    prov.agg <- setdiff(unique(df.can$province), prov.keep)
    tmp.agg <- filter(df.can, province %in% prov.agg)
    tmp.agg <- aggregate.time(tmp.agg, "Canada", "RestOfProvinces")
  }
  df.can <- rbind(tmp.keep, tmp.agg)
}
df.can <- rbind(df.can, df.onlyCan)

######################### Regions of Quebec

library(jsonlite)
# Per-region report for Quebec: `regions$name` gives the region names and
# `regions$data` one table per region with columns date, d and c.
jfile <- "https://ledevoir-coronavirus.herokuapp.com/api/v2/reports/ca/qc"
df <- fromJSON(jfile)
reg <- df$regions
pr <- reg$name

# Build one standard-layout table per region (d -> cum_dead,
# c -> cum_confirm) and stack them once. seq_along() avoids the 1:0 trap of
# 1:length(pr) and the single rbind avoids growing a data frame in a loop.
tmp.qc <- do.call(rbind, lapply(seq_along(pr), function(i) {
  region.data <- reg$data[[i]][, c("date", "d", "c")]
  data.frame(time = region.data[["date"]],
             country = rep("Canada", nrow(region.data)),
             province = rep(pr[i], nrow(region.data)),
             cum_confirm = region.data[["c"]],
             cum_dead = region.data[["d"]])
}))

# Dates are month/day/two-digit-year strings. Parsing the two-digit year with
# "%y" gives the same result as the former 'append "20" and parse with %Y'
# trick for years ending in 20, without hard-coding the year.
tmp.qc$time <- as.Date(as.character(tmp.qc$time), format = "%m/%d/%y")
tmp.qc <- add.daily(tmp.qc)

# Reference date: two days before the date of the last row.
end <- rev(tmp.qc$time)[1]
tmp <- na.omit(arrange(filter(tmp.qc, time == end - 2), cum_dead))
if (ratio > 0) {
  # NOTE(review): D is the total computed in the Canadian provinces section,
  # not sum(tmp$cum_dead) for Quebec, unlike every other section. Confirm
  # that measuring regions against the Canada-wide total is intended.
  reg.keep <- tmp$province[cumsum(tmp$cum_dead) / D > ratio]
  #reg.keep=c("Montréal","Laval","Montérégie")
  tmp.keep <- filter(tmp.qc, province %in% reg.keep)

  if (skip) {
    tmp.agg <- NULL
  } else {
    reg.agg <- setdiff(unique(tmp.qc$province), reg.keep)
    tmp.agg <- filter(tmp.qc, province %in% reg.agg)
    tmp.agg <- aggregate.time(tmp.agg, "Canada", "RestOfRegions")
  }
  df.qc <- rbind(tmp.keep, tmp.agg)
} else {
  df.qc <- tmp.qc
}


########################## Final step

# Quebec regions are appended to the Canadian provinces in the same table.
df.can <- rbind(df.can, df.qc)


######################################################################################
######################### Data for USA  df.usa #######################################
######################################################################################

# Download the per-state daily table and map its columns onto the standard
# layout: date -> time, state -> province, positive -> cum_confirm,
# death -> cum_dead.
data <- read.csv("https://covidtracking.com/api/v1/states/daily.csv",
                 na.strings = "", fileEncoding = "UTF-8-BOM")

data <- data[, c("date", "state", "positive", "death")]
names(data) <- c("time", "province", "cum_confirm", "cum_dead")
# Dates are parsed from their YYYYMMDD text form.
data$time <- as.Date(as.character(data$time), format = "%Y%m%d")
# Missing cumulative counts are treated as zero.
data$cum_confirm[is.na(data$cum_confirm)] <- 0
data$cum_dead[is.na(data$cum_dead)] <- 0
data <- data.frame(data, country = factor("US"))
df.usa <- data[, c("time", "country", "province", "cum_confirm", "cum_dead")]
# Sort chronologically within each state before differencing.
df.usa <- add.daily(rev.time(df.usa))

################ State selection

if (ratio > 0) {
  # Reference date: two days before the latest date in the data.
  end <- sort(unique(df.usa$time), decreasing = TRUE)[1]
  tmp <- arrange(filter(df.usa, time == end - 2), cum_dead)
  # States sorted by increasing deaths: keep those reached once the running
  # share of deaths exceeds `ratio`.
  state.keep <- tmp$province[cumsum(tmp$cum_dead) / sum(tmp$cum_dead) > ratio]
  tmp.keep <- filter(df.usa, province %in% state.keep)
  if (skip) {
    tmp.agg <- NULL
  } else {
    state.agg <- setdiff(unique(df.usa$province), state.keep)
    tmp.agg <- filter(df.usa, province %in% state.agg)
    tmp.agg <- aggregate.time(tmp.agg, "US", "RestOfStates")
  }
  df.usa <- rbind(tmp.keep, tmp.agg)
}

############ Final step

# Append the country-level series taken from df.glob, relabelled "US".
tmp.us <- filter(df.glob, country == "United_States_of_America")
tmp.us$country <- factor(rep("US", nrow(tmp.us)))
df.usa <- rbind(df.usa, tmp.us)


##########################################################################################
############################# Data for France  df.france #################################
##########################################################################################


############ Sub-regions (départements)

# Download the per-departement table (semicolon-separated) and keep four
# columns, renamed to the standard names: dep -> province, jour -> time,
# incid_hosp -> confirm, incid_dc -> dead. These are daily counts.
data <- read.csv(
  "https://www.data.gouv.fr/fr/datasets/r/6fadff46-9efd-4c53-942a-54aca783c30c",
  na.strings = "", fileEncoding = "UTF-8-BOM", sep = ";"
)
data <- setNames(data[, c("dep", "jour", "incid_hosp", "incid_dc")],
                 c('province', 'time', 'confirm', 'dead'))
data$time <- as.Date(data$time)

df.france <- data.frame(time = data$time,
                        country = rep("France", nrow(data)),
                        province = data$province,
                        confirm = data$confirm,
                        dead = data$dead)

# Drop incomplete rows, then build the cumulative series per departement.
df.france <- add.cum(na.omit(df.france))
# Departement codes must be plain strings for the region relabelling below.
df.france$province <- as.character(df.france$province)

#################### Aggregation by regions

# Relabel a set of provinces with a single name.
#
# df:   data frame with a `province` column.
# deps: vector of province values to relabel.
# pr:   the label written in their place.
# Returns `df` with every `province` value found in `deps` replaced by `pr`;
# all other rows and columns are left untouched.
make.pr <- function(df, deps, pr) {
  df$province <- replace(df$province, df$province %in% deps, pr)
  df
}

# Region label -> departement codes. Every departement in a list is relabelled
# with the region name, then the rows are summed per region and date.
# (The IDF list used to contain "92" twice; the duplicate had no effect and
# has been removed.)
france.regions <- list(
  AuvergneRA      = c("03", "63", "15", "43", "42", "69", "01", "74", "73",
                      "38", "26", "07"),
  NordPasdeCalais = c("62", "59", "02", "80", "60"),
  AlsaceLorraine  = c("08", "51", "10", "55", "52", "54", "88", "57", "67",
                      "68"),
  Bourgogne       = c("89", "58", "71", "21", "70", "25", "39", "90"),
  IDF             = c("95", "78", "92", "77", "75", "91", "93", "94"),
  Centre          = c("28", "45", "41", "37", "36", "18"),
  Loire           = c("44", "49", "85", "53", "72"),
  Normandie       = c("76", "27", "14", "61", "50"),
  Bretagne        = c("29", "22", "56", "35"),
  Aquitaine       = c("79", "86", "23", "87", "16", "17", "24", "19", "33",
                      "47", "40", "64"),
  MidiPyrenees    = c("46", "12", "48", "30", "34", "81", "82", "32", "31",
                      "65", "09", "11", "66"),
  Provence        = c("05", "04", "84", "06", "83", "13"),
  Corse           = c("2A", "2B"),
  UltraMarin      = c("971", "972", "973", "974", "976")
)

out <- df.france
for (region.name in names(france.regions)) {
  out <- make.pr(out, france.regions[[region.name]], region.name)
}

# Sum the departements of each region per date. The bare function `sum`
# replaces the deprecated `funs(sum(.))` wrapper; column names are unchanged.
out <- out %>%
  group_by(time, country, province) %>%
  summarise_at(vars(cum_confirm, cum_dead, confirm, dead), sum)
df.france <- as.data.frame(out)

########## Skip small regions

if (ratio > 0) {
  reg <- unique(df.france$province)
  # Reference date: two days before the latest date in the data.
  end <- sort(unique(df.france$time), decreasing = TRUE)[1]
  tmp <- arrange(filter(df.france, time == end - 2), cum_dead)
  # Regions sorted by increasing deaths: keep those reached once the running
  # share of deaths exceeds `ratio`; the others form the aggregate.
  reg.keep <- tmp$province[cumsum(tmp$cum_dead) / sum(tmp$cum_dead) > ratio]
  reg.agg <- setdiff(reg, reg.keep)
  tmp.keep <- filter(df.france, province %in% reg.keep)
  if (skip) {
    tmp.agg <- NULL
  } else {
    tmp.agg <- aggregate.time(filter(df.france, province %in% reg.agg),
                              "France", "RestOfRegions")
  }
  df.france <- rbind(tmp.keep, tmp.agg)
}

############### Final step

# Append the country-level series for France taken from df.glob.
df.france <- rbind(df.france, filter(df.glob, country == "France"))

######################################################################
#############################   SAVE DATA ############################
######################################################################

# Write the five data sets to a single .RData file inside `path`.
save(df.glob, df.hubei, df.can, df.usa, df.france,
     file = paste0(path, 'allData.RData'))









