• R/O
  • SSH

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

java, c++, android, linux, c#, windows, objective-c, cocoa, 誰得, qt, python, php, ruby, game, gui, bathyscaphe, c, 計画中(planning stage), 翻訳, omegat, framework, twitter, dom, test, vb.net, directx, ゲームエンジン, btron, arduino, previewer

Commit MetaInfo

Revisão: 2aeecd2bac13f8a430891b9f42f01bedcc43d02c (tree)
Hora: 2024-09-18 04:08:05
Autor: Lorenzo Isella <lorenzo.isella@gmai...>
Commiter: Lorenzo Isella

Mensagem de Log

A cleanup of the code to merge the various parquet files.

Mudança Sumário

Diff

diff -r 69e147994e03 -r 2aeecd2bac13 R-codes/merge_tam_parquet.R
--- a/R-codes/merge_tam_parquet.R Tue Sep 17 20:11:43 2024 +0200
+++ b/R-codes/merge_tam_parquet.R Tue Sep 17 21:08:05 2024 +0200
@@ -11,16 +11,16 @@
1111
1212 add_tctf <- 1
1313
14-read_all <- 0
14+## read_all <- 1
1515
1616
1717
18-tctf <- open_dataset("TCTF_cases.csv", format="csv")
18+tctf <- open_dataset("../input/csv_files/TCTF_cases.csv", format="csv")
1919
2020
21-if (read_all==1){
21+## if (read_all==1){
2222
23-df <- open_dataset("./input_all/", unify_schemas = TRUE) |>
23+df <- open_dataset("../input/parquet-files/", unify_schemas = TRUE) |>
2424 select(everything()) ### I need to select the columns in any case.
2525
2626
@@ -32,10 +32,13 @@
3232
3333 }
3434
35+
36+remove_files_with_pattern("../output/*parquet")
37+
3538 write_dataset(
3639 df,
3740 format = "parquet",
38- path = "./data_output_all/",
41+ path = "../output/",
3942 basename_template="tam_finale_20-03-2024-{i}.parquet" ,
4043 max_rows_per_file = 5e5
4144 )
@@ -48,82 +51,89 @@
4851
4952 print("I start writing the csv files")
5053
54+remove_files_with_pattern("../output/flat_files/*csv")
55+
5156 write_dataset(
5257 df,
5358 format = "csv",
54- path = "./data_output_all/flat_files/",
59+ path = "../output/flat_files/",
5560 basename_template="tam_finale_20-03-2024-{i}.csv" ,
5661 max_rows_per_file = 5e5,
5762 max_open_files=5)
5863
64+system("csvstack ../output/flat_files/*csv > ../output/flat_files/tam_complete.csv")
5965
60-system("gzip ./data_output_all/flat_files/*csv")
6166
67+
68+system("gzip -f -9 ../output/flat_files/tam_complete.csv")
69+
70+remove_files_with_pattern("../output/flat_files/*csv")
71+
6272
6373 }
6474
6575
6676
6777
68-} else{
69-
70-df <- open_dataset("./input_all/extra", unify_schemas = TRUE) |>
71- select(everything()) ### I need to select the columns in any case.
72-
73-if (add_tctf==1){
74-
75- df <- df |>
76- left_join(y=tctf, by=c("case_reference"="sa_case_number")) |>
77- mutate(is_tctf_case=if_else(is.na(is_tctf_case), "No", is_tctf_case))
78-}
78+## } else{
7979
80-write_dataset(
81- df,
82- format = "parquet",
83- path = "./data_output_all/TAM_ES_RO_SI/parquet/",
84- basename_template="tam_tctf_20-03-2024-{i}.parquet" ,
85- max_rows_per_file = 5e5
86-)
80+## df <- open_dataset("../input_all/extra", unify_schemas = TRUE) |>
81+## select(everything()) ### I need to select the columns in any case.
8782
88-print("writing the TAM compressed csv file")
83+## if (add_tctf==1){
8984
90-write_csv_arrow(
91- df,"./data_output_all/TAM_ES_RO_SI/flat_file/transparency_tctf-20-03-2024.csv.gz")
85+## df <- df |>
86+## left_join(y=tctf, by=c("case_reference"="sa_case_number")) |>
87+## mutate(is_tctf_case=if_else(is.na(is_tctf_case), "No", is_tctf_case))
88+## }
89+
90+## write_dataset(
91+## df,
92+## format = "parquet",
93+## path = "../data_output_all/TAM_ES_RO_SI/parquet/",
94+## basename_template="tam_tctf_20-03-2024-{i}.parquet" ,
95+## max_rows_per_file = 5e5
96+## )
97+
98+## print("writing the TAM compressed csv file")
99+
100+## write_csv_arrow(
101+## df,"../data_output_all/TAM_ES_RO_SI/flat_file/transparency_tctf-20-03-2024.csv.gz")
92102
93103
94104
95105
96-### now we do Poland
106+## ### now we do Poland
97107
98-df<- open_dataset("./input_all/poland/", unify_schemas = TRUE) |>
99- select(everything()) ### I need to select the columns in any case.
108+## df<- open_dataset("../input_all/poland/", unify_schemas = TRUE) |>
109+## select(everything()) ### I need to select the columns in any case.
100110
101111
102- if (add_tctf==1) {
103-
104- df <- df |>
105- left_join(y=tctf, by=c("case_reference"="sa_case_number")) |>
106- mutate(is_tctf_case=if_else(is.na(is_tctf_case), "No", is_tctf_case))
107-
108-}
112+## if (add_tctf==1) {
109113
110-write_dataset(
111- df,
112- format = "parquet",
113- path = "./data_output_all/poland/arrow/",
114- basename_template="poland_tctf_20-03-2024-{i}.parquet" ,
115- max_rows_per_file = 5e5
116-)
114+## df <- df |>
115+## left_join(y=tctf, by=c("case_reference"="sa_case_number")) |>
116+## mutate(is_tctf_case=if_else(is.na(is_tctf_case), "No", is_tctf_case))
117117
118-print("writing Poland compressed csv file")
118+## }
119119
120-write_csv_arrow(
121- df,"./data_output_all/poland/flat_file/poland_tctf-20-03-2024.csv.gz")
120+## write_dataset(
121+## df,
122+## format = "parquet",
123+## path = "../data_output_all/poland/arrow/",
124+## basename_template="poland_tctf_20-03-2024-{i}.parquet" ,
125+## max_rows_per_file = 5e5
126+## )
127+
128+## print("writing Poland compressed csv file")
129+
130+## write_csv_arrow(
131+## df,"../data_output_all/poland/flat_file/poland_tctf-20-03-2024.csv.gz")
122132
123133
124134
125135
126-}
136+## }
127137
128138
129139