Skip to content

Commit

Permalink
Cleaning columns
Browse files Browse the repository at this point in the history
I've refined the download function, adding headers and cleaning columns (e.g., using toupper()). The file includes new registration values (although this test file is the 2017 provisional version), which should help with matching.
  • Loading branch information
wengraf committed Jan 29, 2021
1 parent bafb1a2 commit 6368439
Showing 1 changed file with 62 additions and 2 deletions.
64 changes: 62 additions & 2 deletions R/carmatch.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,74 @@ if(!file.exists(f)) {
print(f)
# every single car type

read.csv(
f, fileEncoding = "UTF-16", sep = "\t", header = FALSE
data <- read.csv(
f, fileEncoding = "UTF-16", sep = "\t", header = TRUE
)
# fails:
# readr::read_tsv(f, locale = readr::locale(encoding="UTF-16"))

names <- c("ID",
"Country",
"Manufacturer_pooling",
"Vehicle_family_identification_number",
"Manufacturer_name_EU",
"Manufacturer_name_OEM",
"Manufacturer_name_MS",
"Type_Approval_Number",
"Type",
"Variant",
"Version",
"Make",
"Commercial_name",
"Category_type_approved",
"Category_vehicle_registered",
"Mass_kg",
"WLTP_Mass",
"NEDC_CO2",
"WLTP_CO2",
"Wheelbase",
"SteeringAxle",
"OtherAxle",
"Fuel_type",
"Fuel_mode",
"Engine_capacity",
"Engine_power",
"Electric_consumpt",
"Innovative_tech",
"NEDC_Emissions_tech",
"WLTP_Emissions_tech",
"Deviation_factor",
"Verification_factor",
"Total_new_registrations")
colnames(data) <- names
data$Manufacturer_pooling[data$Manufacturer_pooling == ""] <- NA
data$Manufacturer_pooling <- as.factor(data$Manufacturer_pooling)
data$Vehicle_family_identification_number[data$Vehicle_family_identification_number == ""] <- NA
data$Manufacturer_name_EU <- toupper(data$Manufacturer_name_EU)
data$Manufacturer_name_OEM[data$Manufacturer_name_OEM == ""] <- NA
data$Manufacturer_name_OEM <- as.factor(toupper(base::trimws(data$Manufacturer_name_OEM, which = c("both"))))
data$Manufacturer_name_MS[data$Manufacturer_name_MS == ""] <- NA
data$Manufacturer_name_MS <- as.factor(toupper(base::trimws(data$Manufacturer_name_MS, which = c("both"))))
data$Type_Approval_Number[data$Type_Approval_Number == ""] <- NA
data$Make[data$Make == ""] <- NA
data$Make <- as.factor(toupper(base::trimws(data$Make, which = c("both"))))
data$Commercial_name[data$Commercial_name == ""] <- NA
data$Commercial_name <- as.factor(toupper(base::trimws(data$Commercial_name, which = c("both"))))
data$Category_type_approved[data$Category_type_approved == ""] <- NA
data$Category_type_approved <- as.factor(toupper(base::trimws(data$Category_type_approved, which = c("both"))))
data$Category_vehicle_registered <- NULL
data$Fuel_type[data$Fuel_type == ""] <- NA
data$Fuel_type <- as.factor(toupper(base::trimws(data$Fuel_type, which = c("both"))))
data$Fuel_mode[data$Fuel_mode == ""] <- NA
data$Fuel_mode <- toupper(base::trimws(data$Fuel_mode, which = c("both")))
data$Innovative_tech[data$Innovative_tech == ""] <- NA
data

}
# Build the cleaned table by calling cm_get().
d_raw <- cm_get()

# Serialise the table to disk and report how long the write takes.
# 35s on fast computer! 40 MB
system.time(saveRDS(object = d_raw, file = "d_raw_euco2.Rds"))

# Tabulate the number of rows per fuel type.
summary(as.factor(d_raw[["Fuel_type"]]))


0 comments on commit 6368439

Please sign in to comment.