Rev. | f3a57e7c28663047af562c4658b21c58f2ccd899 |
---|---|
Tamanho | 10,956 bytes |
Hora | 2019-08-07 19:25:04 |
Autor | Lorenzo Isella |
Mensagem de Log | A code for some advanced text reading and manipulation. |
## Session setup: clear the workspace, attach the packages used below, load the
## project helper library and read the raw export data.
## NOTE(review): rm(list = ls()) deletes every object in the caller's global
## environment when this script is sourced; running it in a fresh R session
## would make this line unnecessary.
rm(list=ls())
library(tidyverse)
library(scales)
library(stringdist)
library(viridis)
## Project helper library. Presumably the source of golden(), substrLeft(),
## to_csv(), my_ggplot_theme2(), mypercent, remove_words_vectorized() and
## calc_growth(), none of which are defined in this script -- TODO confirm.
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
## Reference year: the export series are later divided by their value in this year.
base_year <- 2008
## Width/height ratio applied to every figure saved with ggsave() below.
golden_ratio <- golden()
## Earlier base-R reader, kept for reference:
## df_ini <- read.delim("gi_selected_partners.tsv",stringsAsFactors = FALSE, quote = "", sep = "\t") %>% as_tibble
## Raw export records, read as a latin1-encoded tab-separated file.
## The trailing commented-out pipe is a disabled restriction to years >= 2008.
df_ini <- read_tsv("gi_selected_partners.tsv", locale = readr::locale(encoding = "latin1"))## %>%
## filter(year>=2008)
## Product-level exports to Canada: keep the identifying columns and the export
## value, discard rows with any missing field and add a `chapter` column built
## with substrLeft(productid, 2) (presumably the two leading characters of the
## product code -- helper defined outside this script).
ceta_exports <- df_ini %>%
  filter(partnerlabel == "Canada") %>%
  select(
    reporterlabel, partnerlabel, year, productid,
    prod_label, e_value
  ) %>%
  filter(complete.cases(.)) %>%
  mutate(chapter = substrLeft(productid, 2))

## Yearly export totals per partner and chapter. Chapter "04" is labelled
## "Cheese"; any other chapter is labelled "Wine and Spirits".
ceta_chapters <- ceta_exports %>%
  group_by(partnerlabel, chapter, year) %>%
  summarise(total = sum(e_value)) %>%
  ungroup() %>%
  mutate(type = if_else(chapter != "04", "Wine and Spirits", "Cheese"))
## Yearly export totals for every partner in the raw data, aggregated by the
## `chapter` value returned by substrLeft(productid, 2). Rows with any missing
## field are dropped before summing. Chapter "04" is labelled "Cheese", all
## other chapters "Wine and Spirits".
gi_exports <- df_ini %>%
  select(
    reporterlabel, partnerlabel, year, productid,
    prod_label, e_value
  ) %>%
  filter(complete.cases(.)) %>%
  mutate(chapter = substrLeft(productid, 2)) %>%
  group_by(partnerlabel, chapter, year) %>%
  summarise(total = sum(e_value)) %>%
  ungroup() %>%
  mutate(type = if_else(chapter != "04", "Wine and Spirits", "Cheese"))

## Persist the aggregated table (to_csv() comes from the helper library).
to_csv(gi_exports, "gi_exports_to_selected_partners.csv")
## Same raw data aggregated by the `hs4` value returned by
## substrLeft(productid, 4): yearly totals per partner and hs4 code.
## "0406" is labelled "Cheese", "2204" "Wine" and everything else "Spirits".
## The constant `origin` column marks these rows as the GI series.
gi_exports2 <- df_ini %>%
  select(
    reporterlabel, partnerlabel, year, productid,
    prod_label, e_value
  ) %>%
  filter(complete.cases(.)) %>%
  mutate(hs4 = substrLeft(productid, 4)) %>%
  group_by(partnerlabel, hs4, year) %>%
  summarise(total = sum(e_value)) %>%
  ungroup() %>%
  mutate(
    type = if_else(
      hs4 == "0406",
      "Cheese",
      if_else(hs4 == "2204", "Wine", "Spirits")
    ),
    origin = "GI"
  )
## Second export series read from wines_cheese.tsv. The product id is used
## directly as `hs4` and the export value as `total`, so the columns line up
## with gi_exports2. Product types follow the same hs4 labelling and `origin`
## marks the rows as the "All Exports" series.
wine_cheese <- read_tsv("wines_cheese.tsv") %>%
  select(partnerlabel, year, total = e_value, hs4 = productid) %>%
  mutate(
    type = if_else(
      hs4 == "0406",
      "Cheese",
      if_else(hs4 == "2204", "Wine", "Spirits")
    ),
    origin = "All Exports"
  )
## Stack the GI and all-exports series and express every yearly total relative
## to the base_year total of the same origin/type/hs4/partner series.
## NOTE(review): the division relies on each group holding exactly one
## base_year row -- confirm for new input data.
exports <- bind_rows(gi_exports2, wine_cheese) %>%
  group_by(origin, type, hs4, partnerlabel) %>%
  mutate(total_base = total / total[year == base_year]) %>%
  ungroup()
## Disabled restriction to recent years:
## filter(year>=2008)
## Line chart of the aggregated GI export totals (gi_exports) by year, with one
## facet per partner/product-type combination and independent y axes.
## `total` is divided by 1e3 for the "MIO €" axis (the CSV export further down
## names the same quantity `exports_1000_eur`).
gpl <- ggplot(data = gi_exports, aes(x = year, y = total / 1e3)) +
  geom_line(size = 1.3) +
  geom_point(size = 3) +
  facet_wrap(partnerlabel ~ type, nrow = 2, scales = "free_y") +
  my_ggplot_theme2("top") +
  theme(
    axis.text.x = element_text(size = 15, angle = 0, colour = "black", vjust = 0.5),
    axis.text.y = element_text(angle = 0)
  ) +
  labs(title = "EU28 GI Exports in Cheese, Wine and Spirits to Selected Partners") +
  scale_y_continuous(breaks = pretty_breaks(n = 5)) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  xlab("Year") +
  ylab("Export Value (MIO €)")

## Save as PDF (cairo device) and PNG. `width`/`height` are spelled out in full
## instead of relying on partial matching of `w`/`h`.
ggsave("gi_exports.pdf", gpl,
  device = cairo_pdf,
  width = 10 * golden_ratio, height = 10
)
ggsave("gi_exports.png", gpl, width = 10 * golden_ratio, height = 10)
## Two-colour palette: the first two of three viridis colours.
my_pal <- viridis(n = 3)[1:2]

## Exports relative to base_year for both series (`origin`), one facet per
## partner/product-type combination. Colour, line type and point shape all
## encode `origin`. The legend labels are given in the order of the sorted
## `origin` values ("All Exports", then "GI").
gpl <- ggplot(
  data = exports,
  aes(
    x = year, y = total_base,
    color = origin, linetype = origin, shape = origin
  )
) +
  geom_line(size = 1.3) +
  geom_point(size = 3) +
  facet_wrap(partnerlabel ~ type, nrow = 4, scales = "free_y") +
  my_ggplot_theme2("top") +
  ## theme(axis.text.x = element_text(size=15,angle=0, colour="black", vjust=0.5), axis.text.y = element_text(angle=0) )+
  scale_color_manual(NULL, labels = c("GIs and non-GIs", "GIs"), values = my_pal) +
  ## TRUE instead of T: T is an ordinary variable that can be reassigned.
  scale_shape_discrete(NULL, solid = TRUE, labels = c("GIs and non-GIs", "GIs")) +
  scale_linetype_discrete(NULL, labels = c("GIs and non-GIs", "GIs")) +
  labs(title = "EU28 Exports in Cheese, Wine and Spirits to Selected Partners") +
  scale_y_continuous(breaks = pretty_breaks(n = 5), labels = mypercent) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  xlab("Year") +
  ylab(paste0("Exports (Baseline: ", base_year, ")"))

## Save as PDF (cairo device) and PNG, with `width`/`height` spelled out in full.
ggsave("gi_exports_base.pdf", gpl,
  device = cairo_pdf,
  width = 10 * golden_ratio, height = 12
)
ggsave("gi_exports_base.png", gpl, width = 10 * golden_ratio, height = 12)
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################
## GI product codes: header-less CSV, so readr names the columns X1, X2, ...
## The second column is dropped and the first becomes `code`.
## FALSE instead of F: F is an ordinary variable that can be reassigned.
gi_codes <- read_csv("GI_codes.csv", col_names = FALSE) %>%
  select(-X2) %>%
  rename(code = X1)

## Full product nomenclature with normalised labels: transliterated to ASCII,
## lower-cased and with punctuation replaced by spaces, so that labels can be
## searched with plain word patterns.
all_names <- read_tsv("nomenclature.tsv",
  locale = readr::locale(encoding = "latin1")
) %>%
  mutate(prod_label = iconv(prod_label, to = "ASCII//TRANSLIT")) %>%
  mutate(prod_label = tolower(prod_label)) %>%
  mutate(prod_label = str_replace_all(prod_label, "[[:punct:]]", " "))

## Nomenclature entries whose product id is one of the GI codes.
gi_names <- all_names %>%
  filter(productid %in% gi_codes$code)

## GI codes (supplied by Bea) that do not exist in the nomenclature.
extra <- setdiff(gi_codes$code, gi_names$productid) %>% as_tibble()
to_csv(extra, "missing_codes.csv")
## CETA indications: use `alternative` when it is present, otherwise `product`.
## `newname` is the indication transliterated to ASCII, lower-cased and with
## punctuation replaced by spaces (same normalisation as the nomenclature labels).
ceta_GI <- read_csv("ceta2.csv") %>%
  mutate(
    indication = if_else(is.na(alternative), product, alternative),
    newname = str_replace_all(
      tolower(iconv(indication, to = "ASCII//TRANSLIT")),
      "[[:punct:]]",
      " "
    )
  )

## Pass the listed filler words to remove_words_vectorized() (helper from the
## project library, presumably stripping them from each name) ...
terms <- remove_words_vectorized(
  ceta_GI$newname,
  c("aus", "der", "oil", "olive", "del", "les", "sud", "ouest", "della", "dos", "oli")
)

## ... and keep, per indication, the words of at least three characters.
ceta_names <- str_extract_all(terms, '\\w{3,}')
## For every indication, the nomenclature rows whose label contains at least
## one of its words (the words are joined into a single regex alternation).
some_words <- map(ceta_names, function(word_set) {
  any_word_regex <- paste(word_set, collapse = "|")
  all_names %>% filter(grepl(any_word_regex, prod_label))
})

## Keep only the indications that produced at least one match.
some_words_length <- map(some_words, nrow)
sel_some <- which(some_words_length != 0)
some_words_non_empty <- some_words[sel_some]
## For every indication, the nomenclature rows whose label contains ALL of its
## words. Regular expressions have no "and" operator: joining the words with
## "&" searches for the literal text "word1&word2", which can never occur in
## labels whose punctuation was replaced by spaces. Each word is therefore
## tested separately and the per-word results are combined with a logical AND.
## An indication without any word keeps every row (the all-TRUE initial value),
## matching what the empty pattern did before.
all_words <- map(ceta_names, function(x) {
  contains_every_word <- reduce(
    map(x, function(word) grepl(word, all_names$prod_label)),
    `&`,
    .init = rep(TRUE, nrow(all_names))
  )
  all_names[contains_every_word, ]
})

## Keep only the indications that produced at least one match.
all_words_length <- map(all_words, nrow)
sel_all <- which(all_words_length != 0)
all_words_non_empty <- all_words[sel_all]
#####################################################
## Hand-picked eight-digit product codes.
hs_sel <- tibble(
  hs8 = c(
    "04069032", "04069085", "04064010", "04069082",
    "04069084", "04069013", "04069081", "04069079",
    "04069073", "04069075", "04069076", "04064050",
    "04069061", "04061030", "04069063", "22082026",
    "22082086", "04069078", "04069023"
  )
)

## Nomenclature entries for the selected codes.
ceta_matching <- filter(all_names, productid %in% hs_sel$hs8)
to_csv(ceta_matching, "ceta_matches.csv")

## Raw export records to Canada for the selected codes.
prod_sel <- df_ini %>%
  filter(partnerlabel == "Canada", productid %in% hs_sel$hs8) %>%
  select(reporterlabel, partnerlabel, year, e_value, prod_label, productid)
to_csv(prod_sel, "ceta_matches_exports.csv")
## Reference series: the "Cheese" rows for Canada from wine_cheese
## (a restriction to year >= 2008 is currently disabled).
ceta_baseline <- wine_cheese %>%
  filter(partnerlabel == "Canada", type == "Cheese") %>%
  select(year, total, type)

## Example products from the Canada selection. The first word of the product
## label becomes the series name (`type`) and the export value becomes `total`,
## matching the columns of ceta_baseline.
ceta_examples <- prod_sel %>%
  filter(productid %in% c(
    "04069023", "04064010", "04064050", "040690",
    "04069084"
  )) %>%
  mutate(prod_label = word(prod_label, 1)) %>%
  select(year, total = e_value, type = prod_label)
## Combine the reference series with the example products. Within each series
## (`type`), `base` is the total relative to the base_year total; calc_growth()
## (project helper) is then applied to the still-grouped data with `total` and
## `growth` as arguments -- presumably adding a `growth` column, which the
## growth plot below reads.
df_ceta <- bind_rows(ceta_baseline, ceta_examples) %>%
  group_by(type) %>%
  mutate(base = total / total[year == base_year]) %>%
  calc_growth(total, growth) %>%
  ungroup()

## n_col is the number of distinct series plus one; the palette therefore has
## exactly one viridis colour per series.
n_col <- length(unique(df_ceta$type)) + 1
## my_pal <- c(viridis(n = n_col))[1:n_col-1]
my_pal <- viridis(n = n_col - 1)
## Exports relative to base_year for the series in df_ceta (the Canada cheese
## reference series plus the selected example products). Colour, line type and
## point shape all encode the series (`type`); legend labels are the `type`
## values themselves.
gpl <- ggplot(
  data = df_ceta,
  aes(
    x = year, y = base,
    color = type, linetype = type, shape = type
  )
) +
  geom_line(size = 1.) +
  geom_point(size = 2, stroke = 1.7) +
  my_ggplot_theme2("top") +
  scale_color_manual(NULL, values = my_pal) +
  ## FALSE instead of F: F is an ordinary variable that can be reassigned.
  scale_shape_discrete(NULL, solid = FALSE) +
  scale_linetype_discrete(NULL) +
  ## df_ceta only holds cheese exports to Canada, so the title says so (the
  ## previous text was carried over from the multi-partner chart).
  labs(title = "EU28 Exports in Cheese to Canada") +
  scale_y_continuous(breaks = pretty_breaks(n = 5), labels = mypercent) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  xlab("Year") +
  ylab(paste0("Exports (Baseline: ", base_year, ")"))

## Save as PDF (cairo device) and PNG, with `width`/`height` spelled out in full.
ggsave("gi_exports_base_sel.pdf", gpl,
  device = cairo_pdf,
  width = 7 * golden_ratio, height = 7
)
ggsave("gi_exports_base_sel.png", gpl, width = 7 * golden_ratio, height = 7)
## Export the plotted selection without the `base` column; `total` is written
## out under the name `exports_1000_eur`.
temp <- rename(select(df_ceta, -base), exports_1000_eur = total)
to_csv(temp, "ceta_selection_exports.csv")
## `growth` column of df_ceta by year for every series. Colour, line type and
## point shape all encode the series (`type`); legend labels are the `type`
## values themselves.
gpl <- ggplot(
  data = df_ceta,
  aes(
    x = year, y = growth,
    color = type, linetype = type, shape = type
  )
) +
  geom_line(size = 1.) +
  geom_point(size = 2, stroke = 1.7) +
  my_ggplot_theme2("top") +
  scale_color_manual(NULL, values = my_pal) +
  ## FALSE instead of F: F is an ordinary variable that can be reassigned.
  scale_shape_discrete(NULL, solid = FALSE) +
  scale_linetype_discrete(NULL) +
  labs(title = "EU28 Exports in Cheese to Canada") +
  scale_y_continuous(breaks = pretty_breaks(n = 5), labels = mypercent) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  xlab("Year") +
  ylab("Percentage Growth")

## Save as PDF (cairo device) and PNG, with `width`/`height` spelled out in full.
ggsave("gi_exports_base_sel_growth.pdf", gpl,
  device = cairo_pdf,
  width = 7 * golden_ratio, height = 7
)
ggsave("gi_exports_base_sel_growth.png", gpl, width = 7 * golden_ratio, height = 7)
## Disabled experiment: match indications to GI product labels through a
## string-distance matrix (stringdist package, method "dl") instead of the
## word searches used earlier in this script.
## NOTE(review): ceta_names is the list returned by str_extract_all(), so
## ceta_names$indication is NULL -- ceta_GI$indication was probably intended.
## NOTE(review): which(temp==max(temp)) picks the LARGEST distance, i.e. the
## least similar label, and the resulting column indices (relative to gi_names)
## are then used to subset all_names -- confirm both before re-enabling.
## text_dist <- stringdistmatrix(ceta_names$indication, gi_names$prod_label,
## method = "dl" ## 'cosine', q = 1
## )
## list_similar <- c()
## for ( i in seq(nrow(text_dist))){
## temp <- text_dist[i , ]
## sel <- which(temp==max(temp))[1]
## print("sel is, ")
## print(sel)
## list_similar <- c(list_similar, sel)
## }
## my_matches <- all_names[list_similar, ] %>%
## select(productid,prod_label ) %>%
## mutate(ceta=ceta_names$indication)
## Marker printed once the whole script has run without stopping on an error.
print("So far so good")