• R/O
  • SSH

Tags
No Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

File Info

Rev. f3a57e7c28663047af562c4658b21c58f2ccd899
Tamanho 10,956 bytes
Hora 2019-08-07 19:25:04
Autor Lorenzo Isella
Mensagem de Log

Code for some advanced text reading and manipulation.

Content

## Start from an empty workspace so objects left over from an earlier
## session cannot leak into this run.
rm(list = ls())

## Packages used below: tidyverse (readr/dplyr/purrr/stringr/ggplot2),
## scales (axis breaks), stringdist, viridis (colour palettes).
library(tidyverse)
library(scales)
library(stringdist)
library(viridis)

## Project helper library. The non-package helpers called in this script
## (golden, substrLeft, to_csv, ...) are presumably defined there -- TODO
## confirm.
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")

## Reference year used to normalise the export series further down.
base_year <- 2008

## Factor applied to the figure height to obtain the figure width.
golden_ratio <- golden()

## Raw export records: tab-separated, latin1-encoded.
## A read.delim() based loader and a `year >= 2008` restriction were used
## here at some point; both are currently disabled.
df_ini <- read_tsv(
  "gi_selected_partners.tsv",
  locale = readr::locale(encoding = "latin1")
)


## Export records towards Canada: keep the identifying columns and the
## export value, drop rows with any missing field, and derive `chapter`
## from the product id via substrLeft(productid, 2) (presumably the two
## leading characters -- helper defined outside this file).
ceta_exports <- df_ini %>%
  filter(partnerlabel == "Canada") %>%
  select(
    reporterlabel, partnerlabel, year,
    productid, prod_label, e_value
  ) %>%
  filter(complete.cases(.)) %>%
  mutate(chapter = substrLeft(productid, 2))

## Yearly totals per partner and chapter. Chapter "04" is labelled
## "Cheese"; every other chapter is labelled "Wine and Spirits".
ceta_chapters <- ceta_exports %>%
  group_by(partnerlabel, chapter, year) %>%
  summarise(total = sum(e_value)) %>%
  ungroup() %>%
  mutate(type = if_else(chapter == "04", "Cheese", "Wine and Spirits"))


## Yearly export totals for every partner in the input, aggregated by
## partner and chapter (substrLeft(productid, 2), presumably the two
## leading characters of the product id). Rows with any missing field are
## discarded first. Chapter "04" is labelled "Cheese", all others
## "Wine and Spirits".
gi_exports <- df_ini %>%
  select(
    reporterlabel, partnerlabel, year,
    productid, prod_label, e_value
  ) %>%
  filter(complete.cases(.)) %>%
  mutate(chapter = substrLeft(productid, 2)) %>%
  group_by(partnerlabel, chapter, year) %>%
  summarise(total = sum(e_value)) %>%
  ungroup() %>%
  mutate(type = if_else(chapter == "04", "Cheese", "Wine and Spirits"))

## Persist the aggregated table.
to_csv(gi_exports, "gi_exports_to_selected_partners.csv")



## Yearly export totals per partner at the 4-character product level
## (`hs4` = substrLeft(productid, 4)). "0406" is labelled "Cheese", "2204"
## "Wine" and anything else "Spirits"; every row is tagged origin = "GI".
gi_exports2 <- df_ini %>%
  select(
    reporterlabel, partnerlabel, year,
    productid, prod_label, e_value
  ) %>%
  filter(complete.cases(.)) %>%
  mutate(hs4 = substrLeft(productid, 4)) %>%
  group_by(partnerlabel, hs4, year) %>%
  summarise(total = sum(e_value)) %>%
  ungroup() %>%
  mutate(
    type = if_else(
      hs4 == "0406",
      "Cheese",
      if_else(hs4 == "2204", "Wine", "Spirits")
    ),
    origin = "GI"
  )


## Export series read from wines_cheese.tsv, reshaped to the same columns
## as the GI table: `productid` becomes `hs4`, `e_value` becomes `total`,
## the product type is derived from `hs4` and every row is tagged
## origin = "All Exports".
wine_cheese <- read_tsv("wines_cheese.tsv") %>%
  select(partnerlabel, year, total = e_value, hs4 = productid) %>%
  mutate(
    type = if_else(
      hs4 == "0406",
      "Cheese",
      if_else(hs4 == "2204", "Wine", "Spirits")
    ),
    origin = "All Exports"
  )

                             
## Stack the GI-only and the all-exports series and express every series
## (one per origin / type / hs4 / partner) relative to its own value in
## `base_year`.
##
## The base-year value is looked up with match(): it returns the position
## of the first row whose year equals base_year, or NA when the group has
## no such row. Indexing with NA yields NA, so a series without a
## base-year observation gets an NA index instead of aborting the whole
## script with a size-0 recycling error inside mutate().
##
## (An optional restriction to year >= 2008 is currently disabled.)
exports <- bind_rows(gi_exports2, wine_cheese) %>%
  group_by(origin, type, hs4, partnerlabel) %>%
  mutate(total_base = total / total[match(base_year, year)]) %>%
  ungroup()





## Figure 1: GI export totals over time, one panel per partner / product
## type with independent y axes. Totals are divided by 1e3 before plotting.
gpl <- ggplot(data = gi_exports, aes(x = year, y = total / 1e3)) +
  geom_line(size = 1.3) +
  geom_point(size = 3) +
  facet_wrap(partnerlabel ~ type, nrow = 2, scales = "free_y") +
  my_ggplot_theme2("top") +
  theme(
    axis.text.x = element_text(
      size = 15, angle = 0, colour = "black", vjust = 0.5
    ),
    axis.text.y = element_text(angle = 0)
  ) +
  scale_y_continuous(breaks = pretty_breaks(n = 5)) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  labs(
    title = "EU28 GI Exports in Cheese, Wine and Spirits to Selected Partners",
    x = "Year",
    y = "Export Value (MIO €)"
  )

## Save the figure as PDF (cairo device) and as PNG.
ggsave(
  "gi_exports.pdf", gpl,
  device = cairo_pdf, width = 10 * golden_ratio, height = 10
)
ggsave("gi_exports.png", gpl, width = 10 * golden_ratio, height = 10)



## Two-colour palette: the first two of three viridis colours.
my_pal <- viridis(n = 3)[1:2]

## Figure 2: exports indexed to base_year, comparing the two `origin`
## series in one panel per partner / product type. Colour, line type and
## point shape all encode `origin` and carry the same legend labels.
gpl <- ggplot(
  data = exports,
  aes(
    x = year, y = total_base,
    color = origin, linetype = origin, shape = origin
  )
) +
  geom_line(size = 1.3) +
  geom_point(size = 3) +
  facet_wrap(partnerlabel ~ type, nrow = 4, scales = "free_y") +
  my_ggplot_theme2("top") +
  scale_color_manual(
    NULL,
    labels = c("GIs and non-GIs", "GIs"),
    values = my_pal
  ) +
  scale_shape_discrete(
    NULL,
    solid = TRUE,
    labels = c("GIs and non-GIs", "GIs")
  ) +
  scale_linetype_discrete(NULL, labels = c("GIs and non-GIs", "GIs")) +
  scale_y_continuous(breaks = pretty_breaks(n = 5), labels = mypercent) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  labs(
    title = "EU28 Exports in Cheese, Wine and Spirits to Selected Partners",
    x = "Year",
    y = paste0("Exports (Baseline: ", base_year, ")")
  )

## Save the figure as PDF (cairo device) and as PNG.
ggsave(
  "gi_exports_base.pdf", gpl,
  device = cairo_pdf, width = 10 * golden_ratio, height = 12
)
ggsave("gi_exports_base.png", gpl, width = 10 * golden_ratio, height = 12)





















###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################

## GI product codes: header-less CSV; the second column is dropped and the
## first one is named `code`.
gi_codes <- read_csv("GI_codes.csv", col_names = FALSE) %>%
  select(-X2) %>%
  rename(code = X1)

## Product nomenclature (latin1-encoded TSV). Labels are normalised for
## text matching: transliterated to ASCII, lower-cased, and every
## punctuation character replaced by a space.
all_names <- read_tsv(
  "nomenclature.tsv",
  locale = readr::locale(encoding = "latin1")
) %>%
  mutate(
    prod_label = prod_label %>%
      iconv(to = "ASCII//TRANSLIT") %>%
      tolower() %>%
      str_replace_all("[[:punct:]]", " ")
  )

## Nomenclature entries whose product id appears in the GI code list.
gi_names <- filter(all_names, productid %in% gi_codes$code)

## Codes by Bea which do not exist in the nomenclature: GI codes with no
## matching nomenclature entry, written out for inspection.
extra <- as_tibble(setdiff(gi_codes$code, gi_names$productid))

to_csv(extra, "missing_codes.csv")


## CETA indications. `indication` is the alternative name when one is
## given, otherwise the product name; `newname` is the indication
## normalised the same way as the nomenclature labels (ASCII
## transliteration, lower case, punctuation replaced by spaces).
ceta_GI <- read_csv("ceta2.csv") %>%
  mutate(
    indication = if_else(is.na(alternative), product, alternative),
    newname = indication %>%
      iconv(to = "ASCII//TRANSLIT") %>%
      tolower() %>%
      str_replace_all("[[:punct:]]", " ")
  )

## Strip a fixed list of filler words from the normalised names
## (remove_words_vectorized is a helper defined outside this file).
terms <- remove_words_vectorized(
  ceta_GI$newname,
  c(
    "aus", "der", "oil", "olive", "del", "les",
    "sud", "ouest", "della", "dos", "oli"
  )
)

## For each indication, the runs of at least three word characters.
ceta_names <- str_extract_all(terms, "\\w{3,}")





## For every CETA indication, the nomenclature rows whose label contains
## AT LEAST ONE of the indication's words (regex alternation).
##
## An indication with no usable word (empty or NA-only word vector) would
## otherwise produce the empty pattern "", which matches every label and
## reports the whole nomenclature as a hit; such indications now yield
## zero rows.
some_words <- map(ceta_names, function(x) {
  words <- x[!is.na(x)]

  if (length(words) == 0) {
    return(all_names[0, ])
  }

  filter(all_names, grepl(paste(words, collapse = "|"), prod_label))
})

## Number of matching rows per indication.
some_words_length <- map(some_words, nrow)

## Positions of the indications with at least one match, and their matches.
sel_some <- which(some_words_length != 0)

some_words_non_empty <- some_words[sel_some]




## For every CETA indication, the nomenclature rows whose label contains
## ALL of the indication's words.
##
## Regular expressions have no AND operator: joining the words with "&"
## builds a pattern containing a literal ampersand, which can never occur
## in labels whose punctuation has been replaced by spaces, so multi-word
## indications could not match anything. Each word is therefore tested on
## its own (as a fixed string) and the per-word results are combined with
## a logical AND.
##
## Indications with no usable word (empty or NA-only word vector) yield
## zero rows rather than matching the whole nomenclature.
all_words <- map(ceta_names, function(x) {
  words <- x[!is.na(x)]

  if (length(words) == 0) {
    return(all_names[0, ])
  }

  has_every_word <- map(
    words,
    function(w) grepl(w, all_names$prod_label, fixed = TRUE)
  ) %>%
    reduce(`&`)

  filter(all_names, has_every_word)
})

## Number of matching rows per indication.
all_words_length <- map(all_words, nrow)

## Positions of the indications with at least one match, and their matches.
sel_all <- which(all_words_length != 0)

all_words_non_empty <- all_words[sel_all]



#####################################################

## Hand-picked 8-digit product codes.
hs_sel <- tibble(
  hs8 = c(
    "04069032", "04069085", "04064010", "04069082",
    "04069084", "04069013", "04069081", "04069079",
    "04069073", "04069075", "04069076", "04064050",
    "04069061", "04061030", "04069063", "22082026",
    "22082086", "04069078", "04069023"
  )
)

## Nomenclature entries for the selected codes, written out as CSV.
ceta_matching <- filter(all_names, productid %in% hs_sel$hs8)

to_csv(ceta_matching, "ceta_matches.csv")


## Export records towards Canada for the selected 8-digit codes, reduced
## to the columns needed downstream and written out as CSV.
prod_sel <- df_ini %>%
  filter(productid %in% hs_sel$hs8) %>%
  filter(partnerlabel == "Canada") %>%
  select(reporterlabel, partnerlabel, year, e_value, prod_label, productid)

to_csv(prod_sel, "ceta_matches_exports.csv")

## Baseline series: the "Cheese" rows of the wine_cheese table for Canada
## (a year >= 2008 restriction is currently disabled).
ceta_baseline <- wine_cheese %>%
  filter(partnerlabel == "Canada", type == "Cheese") %>%
  select(year, total, type)

## Example products: a subset of the selected codes, each labelled by the
## first word of its product label and reshaped to the baseline's columns
## (year, total, type).
## NOTE(review): "040690" is a 6-digit code, whereas prod_sel only holds
## the 8-digit codes listed in hs_sel, so this entry cannot match any row
## -- confirm whether an 8-digit code was intended.
ceta_examples <- prod_sel %>%
  filter(
    productid %in% c(
      "04069023", "04064010", "04064050", "040690", "04069084"
    )
  ) %>%
  mutate(prod_label = word(prod_label, 1)) %>%
  select(year, total = e_value, type = prod_label)



## Baseline and example series stacked together. Per `type`:
##   * `base`   -- total relative to the series' own value in base_year;
##   * `growth` -- column added by calc_growth(total, growth), a helper
##                 defined outside this file (the growth figure reads it).
##
## The base-year value is looked up with match(), which returns NA when a
## series has no base-year row; that series then gets an NA index instead
## of aborting the script with a size-0 recycling error inside mutate().
##
## The data stay grouped by `type` between the two steps, which is what
## the former ungroup() / group_by(type) pair amounted to.
df_ceta <- bind_rows(ceta_baseline, ceta_examples) %>%
  group_by(type) %>%
  mutate(base = total / total[match(base_year, year)]) %>%
  calc_growth(total, growth) %>%
  ungroup()



## One more than the number of distinct series in df_ceta.
n_col <- length(unique(df_ceta$type)) + 1

## One viridis colour per series.
my_pal <- viridis(n = n_col - 1)

## Figure 3: the series in df_ceta, each indexed to its own base_year
## value. Colour, line type and (hollow) point shape all encode `type`.
##
## df_ceta only contains cheese exports to Canada, so the title says so;
## it used to repeat the title of the multi-partner figure.
gpl <- ggplot(
  data = df_ceta,
  aes(x = year, y = base, color = type, linetype = type, shape = type)
) +
  geom_line(size = 1.) +
  geom_point(size = 2, stroke = 1.7) +
  my_ggplot_theme2("top") +
  scale_color_manual(NULL, values = my_pal) +
  scale_shape_discrete(NULL, solid = FALSE) +
  scale_linetype_discrete(NULL) +
  scale_y_continuous(breaks = pretty_breaks(n = 5), labels = mypercent) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  labs(
    title = "EU28 Exports in Cheese to Canada",
    x = "Year",
    y = paste0("Exports (Baseline: ", base_year, ")")
  )

## Save the figure as PDF (cairo device) and as PNG.
ggsave(
  "gi_exports_base_sel.pdf", gpl,
  device = cairo_pdf, width = 7 * golden_ratio, height = 7
)
ggsave("gi_exports_base_sel.png", gpl, width = 7 * golden_ratio, height = 7)



## CSV export of the selected series: the indexed `base` column is left
## out and `total` is written under the name `exports_1000_eur`.
temp <- rename(select(df_ceta, -base), exports_1000_eur = total)

to_csv(temp, "ceta_selection_exports.csv")

## Figure 4: the `growth` column of df_ceta over time, one line per
## series. Colour, line type and (hollow) point shape all encode `type`.
gpl <- ggplot(
  data = df_ceta,
  aes(x = year, y = growth, color = type, linetype = type, shape = type)
) +
  geom_line(size = 1.) +
  geom_point(size = 2, stroke = 1.7) +
  my_ggplot_theme2("top") +
  scale_color_manual(NULL, values = my_pal) +
  scale_shape_discrete(NULL, solid = FALSE) +
  scale_linetype_discrete(NULL) +
  scale_y_continuous(breaks = pretty_breaks(n = 5), labels = mypercent) +
  scale_x_continuous(breaks = seq(1990, 2020, by = 2)) +
  labs(
    title = "EU28 Exports in Cheese to Canada",
    x = "Year",
    y = "Percentage Growth"
  )

## Save the figure as PDF (cairo device) and as PNG.
ggsave(
  "gi_exports_base_sel_growth.pdf", gpl,
  device = cairo_pdf, width = 7 * golden_ratio, height = 7
)
ggsave(
  "gi_exports_base_sel_growth.png", gpl,
  width = 7 * golden_ratio, height = 7
)







## text_dist <- stringdistmatrix(ceta_names$indication, gi_names$prod_label,
##                                method = "dl" ## 'cosine', q = 1
##                               )


## list_similar <- c()

## for ( i in seq(nrow(text_dist))){

##     temp <- text_dist[i , ]

##     sel <- which(temp==max(temp))[1]
##     print("sel is, ")
##     print(sel)

##     list_similar <- c(list_similar, sel)
## }



## my_matches <- all_names[list_similar, ] %>%
##     select(productid,prod_label ) %>%
##     mutate(ceta=ceta_names$indication)




## Final statement of the script: prints a marker showing that the run
## reached the end.
print("So far so good")