ABOUT

The code aims to provide transparency for the merging and treatment of variables for the Multidimensional Measures of Militarization (m3) Dataset (version 1). Please reach out to in case of any doubts or feedback. The dataset contains information for 159 countries spanning the years 1990 to 2020.

SETUP

#.rs.restartR() to restart session in R

rm(list = ls())
cat("\014") #for clearing console
check.packages <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE,
                     repos = "https://cran.rstudio.com")
  sapply(pkg, require, character.only = TRUE)
}
#installing the packages 
check.packages(c("tidyverse","countrycode","readxl",
                 "WDI","haven","ggplot2","lubridate",
                 "car","olsrr","writexl","DBI","RSQLite"))
##   tidyverse countrycode      readxl         WDI       haven     ggplot2 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
##   lubridate         car       olsrr     writexl         DBI     RSQLite 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE
## Normalization function
norm <- function(v) {
    max <- max(v, na.rm = TRUE)
    min <- min(v, na.rm = TRUE)
    return((v - min) / (max - min))
}

PREPARING DATA

Loading the files

#The first file loads the detailed equipment per country

#conscription ----

#define the path
path_conscription<-"Conscription_for_merge.xlsx"

#read the data
# this dataset has Northen Cyprus
conscription_data <- read_excel(path_conscription,1)  

#m3 dataset -----

#define the path
path_m3<-"PRM3_master.xlsx"

#read the data
m3_data <- read_excel(path_m3,1)

#gmi dataset 2021 -----

path_gmi<-"GMI_2021.xlsx"

gmi_data <- read_excel(path_gmi,1)

##milpol dataset ----

path_milpol<-"02MilPol_final_updated_june2023.dta"

milpol_data<-read_dta(path_milpol)

#economic_m3 dataset ---

ecom3_data <- read_dta("Rdataset_3M_filtered_clean.dta")

#economic_m3 dataset NEW ----

path_ecom3new<-"ecom3_data_new.xlsx"
ecom3_data_new <- read_excel(path_ecom3new,1)
#this dataset is the same as ecom3_data after running all the cleaning stated here but manually changing some values

#dissagreggated gmi dataset ----

dsg_gmi_data<-read_csv("disaggregate_gmi_2021.csv")

Standarize column names

#conscription ----

conscription_names<-c("country_iso"="ISO",
                      "year"="Year",
                      "country.name"="Country",
                      "com_mil_serv"="ComMilServ",
                      "com_mil_serv_gen"="ComMilServ_Gen",
                      "com_mil_serv_sur_min" ="ComMilServ_Dur_min",
                      "com_mil_serv_dur_max" = "ComMilServ_Dur_max",
                      "com_mil_serv_dur_min"="ComMilServ_Dur_min",
                      "alt_civ_serv"="AltCivServ",
                      "mobilisation"="Mobilisation")   

#com_mil_serv_sur_min should not exist, however will drop it at the end (not to mess w/rest of the code)

conscription_data<-conscription_data%>%
  select(all_of(conscription_names))

#m3 dataset -----

m3_names<-c("country.name"="country_text",
            "country_iso"="country_id",
            "country_cowcode"="cowcode",
            "year","mil_origin","mil_leader",
            "mil_mod","mil_veto","mil_repress",
            "mil_repress_count","mil_impun")

m3_data<-m3_data%>%
  select(all_of(m3_names))

#gmi dataset  -----

gmi_names<-c("country_iso","country.name","region",
             "gmi_2000","gmi_2001","gmi_2002",
             "gmi_2003","gmi_2004","gmi_2005",
             "gmi_2006","gmi_2007","gmi_2008",
             "gmi_2009","gmi_2010","gmi_2011",
             "gmi_2012","gmi_2013","gmi_2014",
             "gmi_2015","gmi_2016","gmi_2017",
             "gmi_2018","gmi_2019","gmi_2020")

gmi_data<-gmi_data%>%
  select(all_of(gmi_names))

##milpol dataset ----

milpol_names <- c("year" = "year",
                  "country.name" = "countryname_english",
                   "milpol_cripre" ="MilPol_CriPre" ,
                   "max_conf_milpol_cripre"="Max_Conf_MilPol_CriPre",
                   "p_a_milpol_cripre"="p_a_MilPol_CriPre",
                   "milpol_lawenf" = "MilPol_LawEnf" ,
                    "max_conf_milpol_lawenf" ="Max_Conf_MilPol_LawEnf" ,
                   "p_a_milpol_lawenf" ="p_a_MilPol_LawEnf" ,
                    "milpol_peapre" ="MilPol_PeaPre" ,
                    "max_conf_milpol_peapre" = "Max_Conf_MilPol_PeaPre",
                    "p_a_milpol_peapre" = "p_a_MilPol_PeaPre",
                    "milpol_dummy" ="MilPol_Dummy" ,
                   "milpol_sum" = "MilPol_Sum"
                  )


milpol_data<-milpol_data%>%
  select(all_of(milpol_names))

#economic_m3 dataset ---

ecom3_names<- c("state_abb" = "StateAbb",
                "ccode", "country.name" = "StateName", 
                "year", "pop_total" = "PopTotal",  
                "overmillion_bool"="OverMillion",
                "military_bool" = "Military",
                "dup", "dup2", "mil_eco_dummy", 
                "mil_eco_inf", "mil_eco_own", "mil_eco_share",
                "mil_eco_dom", "mil_eco_small", "mil_eco_medium",
                "mil_eco_large", "mil_eco_vlarge")

ecom3_data<-ecom3_data %>%
  select(all_of(ecom3_names))

#disaggregated gmi ----

dsg_gmi_names <- c("bicc_code", "country.name" = "country_en",
                   "region", "year", "milex_gdp", "milex_health",
                   "milpara_pop", "milpara_phy", "reserve_pop",
                   "weapons_pop", "milex_gdp_norm", "milex_health_norm",
                   "milpara_pop_norm","milpara_phy_norm",
                   "reserve_pop_norm","weapons_pop_norm")

dsg_gmi_data<-dsg_gmi_data %>%
  select(all_of(dsg_gmi_names)) %>%
  select(-c("bicc_code","region")) #"weapons_pop" include again

Setting the types

conscription_data<-type_convert(conscription_data)
m3_data<-type_convert(m3_data)

Setting country codes

#conscription ----

#define a vector of codes that do not appear in the countrycode package
#matched unambiguously: Czechoslovakia, Kosovo, Republic of Serbia and Montenero, Yemen, North, Yemen, South, Yugoslavia

cust_name_iso <- c("Kosovo" = "XKX","Czechoslovakia" ="CSK",
                   "Yugoslavia" = "YUG", "Yemen, North" = "YAR",
                   "Yemen, South" = "YMD", "Republic of Serbia and Montenegro" = "SCG",
                   "Soviet Union"="SUN", "Northern Cyprus" = "CYN") 

#conscription -----
conscription_data<- conscription_data %>% 
  mutate(country_iso3c=countrycode(country.name,origin='country.name',destination='iso3c', custom_match = cust_name_iso))%>%
  select(country_iso3c, everything())


#m3 -----
##matched unambiguously:  Czechoslovakia, Kosovo

m3_data<-m3_data %>%
  mutate(country.name=ifelse((year>1992 & year<=2006 & country.name=="Yugoslavia/Serbia"),"Republic of Serbia and Montenegro",country.name)) %>%
  mutate(country.name = ifelse(year<=1992 & country.name=="Yugoslavia/Serbia","Yugoslavia",country.name)) %>%
  mutate(country.name=ifelse(year>=2007 & country.name=="Yugoslavia/Serbia","Republic of Serbia",country.name))


m3_data<- m3_data %>% 
  mutate(country_iso3c=countrycode(country.name,origin='country.name',destination='iso3c', custom_match = cust_name_iso))%>%
  select(country_iso3c, everything())

#gmi -----

gmi_data<-gmi_data%>%
  mutate(country_iso3c=countrycode(country.name,origin='country.name',destination='iso3c', custom_match = cust_name_iso))%>%
  select(country_iso3c, everything())

#milpol ----

milpol_data<-milpol_data%>%
  mutate(country_iso3c=countrycode(country.name,origin='country.name',destination='iso3c', custom_match = cust_name_iso))%>%
  select(country_iso3c, everything())

#economic_m3 ----

ecom3_data<-ecom3_data%>%
  mutate(country.name=ifelse((year>1992 & year<=2006 & country.name=="Yugoslavia/Serbia"),"Republic of Serbia and Montenegro",country.name)) %>%
  mutate(country.name = ifelse(year<=1992 & country.name=="Yugoslavia/Serbia","Yugoslavia",country.name)) %>%
  mutate(country.name=ifelse(year>=2007 & country.name=="Yugoslavia/Serbia","Republic of Serbia",country.name))

ecom3_data<-ecom3_data%>%
  mutate(country_iso3c=countrycode(country.name,origin='country.name',destination='iso3c', custom_match = cust_name_iso))%>%
  select(country_iso3c, everything())

#disaggregated gmi ----

dsg_gmi_data<-dsg_gmi_data%>%
  mutate(country_iso3c=countrycode(country.name,origin='country.name',destination='iso3c', custom_match = cust_name_iso))%>%
  select(country_iso3c, everything())

Adjusting format

#gmi -----

#pivot to long
cols.num <- gmi_names[-c(1,2,3)]

gmi_data<-gmi_data%>%
    tidyr::pivot_longer(
      cols = all_of(cols.num), 
      names_to = "year", 
      names_prefix = "gmi_",
      values_to = "gmi" 
    )

#convert year to numeric
gmi_data<-gmi_data%>%
  mutate(year=as.numeric(year))

Cleaning the data

#conscription ----

#conscription_data<-conscription_data%>% 
#  mutate(across(where(is.numeric), ~na_if(., -999))) 

conscription_data<-conscription_data%>%
  mutate(no_military_service=ifelse(com_mil_serv_gen==-888,1,0)) %>%
  mutate(no_civil_service=ifelse(alt_civ_serv==-888,1,0))


#m3 ----

#m3_data<-m3_data%>% 
#  mutate(across(where(is.numeric), ~na_if(., -999))) 

# cleaning new ecom3data set

ecom3_data_new<-ecom3_data_new %>%
  select(-c("countryname_standard"))

# Convert "NA" (as text) to actual NA in character columns
ecom3_data_new <- ecom3_data_new %>%
  mutate(across(where(is.character), ~na_if(., "NA")))

Identify country names per dataset

#This will later help verify the merging is correct

#gmi ----
gmi_data<-gmi_data%>%
  rename(.,countryname_gmi=country.name) %>%
  rename(.,country_iso_gmi=country_iso)

#conscription ----

conscription_data<-conscription_data%>%
  rename(.,countryname_conscription=country.name)%>%
  rename(.,country_iso_conscription=country_iso)

#m3 ----

m3_data<-m3_data%>%
  rename(.,countryname_m3=country.name)%>%
  rename(.,country_iso_m3=country_iso)

milpol_data<-milpol_data%>%
  rename(.,countryname_milpol=country.name)

#economic m3 ----

ecom3_data<-ecom3_data %>%
  rename(.,countryname_ecom3 = country.name)

#dissagregated gmi----

dsg_gmi_data<-dsg_gmi_data %>%
  rename(.,countryname_dsg_gmi = country.name)

Solving country continuity

#For conscription data ----
#Serbia & Montenegro (1993-2006) ; Serbia (2007-2021)

conscription_selection<-conscription_data%>%
  filter(str_detect(countryname_conscription,"Serbia"))

conscription_selection <- conscription_selection %>%
  filter((countryname_conscription == "Republic of Serbia" & year >= 2007) | (countryname_conscription == "Republic of Serbia and Montenegro" & year <= 2006))%>%
  mutate(countryname_conscription="Republic of Serbia")  %>%
  mutate(country_iso3c="SRB")

conscription_data <- conscription_data %>%
  filter(!str_detect(countryname_conscription, "Serbia")) 

conscription_data <- rbind(conscription_data, conscription_selection)

#Yemen before 1995
columns_to_set_to_na <- c("com_mil_serv", 
                          "com_mil_serv_gen", 
                          "com_mil_serv_sur_min", 
                          "com_mil_serv_dur_max",
                          "com_mil_serv_dur_min", 
                          "alt_civ_serv", 
                          "mobilisation",
                          "no_military_service",
                           "no_civil_service")

conscription_data <- conscription_data %>% 
  mutate(across(all_of(columns_to_set_to_na), ~ifelse(year < 1995 & countryname_conscription == "Yemen", NA, .)))


#For M3 data ----

m3_selection<-m3_data%>%
  filter(str_detect(countryname_m3,"Serbia"))

m3_selection<-m3_selection%>%
  filter((countryname_m3 == "Republic of Serbia" & year >= 2007) | (countryname_m3 == "Republic of Serbia and Montenegro" & year <= 2006))%>%
  mutate(countryname_m3="Republic of Serbia")  %>%
  mutate(country_iso3c="SRB")

m3_data <- m3_data %>%
  filter(!str_detect(countryname_m3, "Serbia"))

m3_data <- rbind(m3_data, m3_selection)

#Yemens

m3_variables <- c("country_iso_m3","country_cowcode","mil_origin", "mil_leader",
                  "mil_mod", "mil_veto", "mil_repress", 
                  "mil_repress_count", "mil_impun")

m3_yemens_df<- expand.grid(
  countryname_m3 = c("Yemen, North", "Yemen, South"),
  year = 1990:2021
)

m3_yemens_df<-cbind(m3_yemens_df, setNames( lapply(m3_variables, function(x) x=NA), m3_variables))

m3_yemens_df$country_iso3c <- ifelse(m3_yemens_df$countryname_m3 == "Yemen, North", "YAR",
                                         ifelse(m3_yemens_df$countryname_m3 == "Yemen, South", "YMD", NA))

m3_data <- rbind(m3_data, m3_yemens_df)

#For GMI ----
#Nothing needed 

#ForMilpol----
#Solving Yugoslavia/Serbia issue for 1991 and 1992

milpol_selection<-milpol_data%>%
  filter(str_detect(countryname_milpol,"Serbia"))

milpol_selection<-milpol_selection%>%
  mutate(countryname_milpol=ifelse(countryname_milpol == "Republic of Serbia" & year <= 1992, "Yugoslavia" ,countryname_milpol)) %>%
  mutate(country_iso3c=ifelse(countryname_milpol == "Yugoslavia", "YUG",country_iso3c))
         
milpol_data <- milpol_data %>%
  filter(!str_detect(countryname_milpol, "Serbia"))

milpol_data <- rbind(milpol_data, milpol_selection)
    

#Solving Yemen before 1991
#"Yemen, North" = "YAR", "Yemen, South" = "YMD"

columns_to_update <- c("milpol_cripre", "max_conf_milpol_cripre", "p_a_milpol_cripre",
                       "milpol_lawenf", "max_conf_milpol_lawenf", "p_a_milpol_lawenf",
                       "milpol_peapre",
                       "max_conf_milpol_peapre",
                       "p_a_milpol_peapre", "milpol_dummy", "milpol_sum")

milpol_data <- milpol_data %>% 
  mutate(across(all_of(columns_to_update), ~ifelse(countryname_milpol=="Yemen" & year < 1991, NA, .)))


milpol_yemens_df<- expand.grid(
  countryname_milpol = c("Yemen, North", "Yemen, South"),
  year = 1990:2021
)

milpol_yemens_df<-cbind(milpol_yemens_df, setNames( lapply(columns_to_update, function(x) x=NA), columns_to_update))

milpol_yemens_df$country_iso3c <- ifelse(milpol_yemens_df$countryname_milpol == "Yemen, North", "YAR",
                                         ifelse(milpol_yemens_df$countryname_milpol == "Yemen, South", "YMD", NA))

milpol_data <- rbind(milpol_data, milpol_yemens_df)


#For economics M3 data ----

ecom3_selection<-ecom3_data%>%
  filter(str_detect(countryname_ecom3,"Serbia"))

ecom3_selection<-ecom3_selection%>%
  filter((countryname_ecom3 == "Republic of Serbia" & year >= 2007) | (countryname_ecom3 == "Republic of Serbia and Montenegro" & year <= 2006))%>%
  mutate(countryname_ecom3="Republic of Serbia")  %>%
  mutate(country_iso3c="SRB")

ecom3_data <- ecom3_data %>%
  filter(!str_detect(countryname_ecom3, "Serbia"))

ecom3_data <- rbind(ecom3_data, ecom3_selection)


#For dissagregated GMI ----
#Nothing needed 

Min-max standarization

#economic m3 -----

# Variables to standarize
vars_to_standardize <- c("mil_eco_inf", "mil_eco_own", "mil_eco_share",
                         "mil_eco_dom", "mil_eco_small", "mil_eco_medium",
                         "mil_eco_large", "mil_eco_vlarge"
                         )

# Mutate() to create new variables with "_std" suffix and apply norm() function -defined before-
ecom3_data <- ecom3_data %>%
  mutate(across(all_of(vars_to_standardize), ~norm(.), .names = "{col}_std"))


# STD for ecom3_data_new

# Convert specified variables to numeric
ecom3_data_new <- ecom3_data_new %>%
  mutate(across(all_of(vars_to_standardize), as.numeric))

ecom3_data_new <- ecom3_data_new %>%
  mutate(across(all_of(vars_to_standardize), ~norm(.), .names = "{col}_std"))

MERGING

Setting reference countries

#Setting simple reference table for countries and years 

#Select the countries ----
countryyears_df<- m3_data%>%
  select(countryname_standard=countryname_m3,country_iso3c)  %>%
  distinct(country_iso3c, .keep_all = TRUE)

#Add country_cown and country_cowc ----

custom_cowc=c()
custom_cown=c()

countryyears_df<- countryyears_df %>% 
  mutate(country_cowc=countrycode(country_iso3c,origin='iso3c',destination='cowc', custom_match = custom_cowc)) %>%
  mutate(country_cown =countrycode(country_iso3c,origin='iso3c',destination='cown', custom_match = custom_cown))

# Make cown an integer variable
countryyears_df$country_cown <- as.integer(countryyears_df$country_cown)

#Set the reference years ----
years <- 1990:2021

#create the dataframe ----
countryyears_df <- countryyears_df %>%
  crossing(year = years)

#Add region  ----
countryyears_df<- countryyears_df %>% 
  mutate(region_wb=countrycode(country_iso3c,origin='iso3c',destination='region')) %>%
  mutate(region_wb=replace_na(region_wb,"Europe & Central Asia"))

#Add continent ----
countryyears_df<- countryyears_df %>% 
  mutate(continent=countrycode(country_iso3c,origin='iso3c',destination='continent')) %>%
  mutate(continent=ifelse(country_iso3c=="SUN","Europe/Asia",continent)) %>%
  mutate(continent=replace_na(continent,"Europe"))

Do the merging

# Merge by left join

merged_df<-countryyears_df %>%
  left_join(conscription_data,by=c("country_iso3c","year")) %>%
  left_join(m3_data, by=c("country_iso3c","year")) %>%
  left_join(gmi_data,by=c("country_iso3c","year"))  %>%
  left_join(milpol_data,by=c("country_iso3c","year")) %>%
  left_join(ecom3_data_new,by=c("country_iso3c","year")) %>%   #using new corrected dataset
  left_join(dsg_gmi_data, by=c("country_iso3c","year"))

# Make year an integer variable
merged_df$year <- as.integer(merged_df$year)

CLEANING

Clean and arrange

# un.name.en: United Nations English country name
# Names without match: ABW: Aruba, CSK: Czechoslovakia, CUW: Curacao, GGY: Guernsey, GRL: Greenland, HKG: Hong Kong, IMN: Isle of Man, JEY: Jersey, MAC:Macao, SCG: Republic of Serbia and Montenegro, SXM: Sint Maarten, TWN: Republic of China (Taiwan),  VAT: Vatican,  XKX: Republic of Kosovo,  YAR: North Yemen,  YMD: South Yemen, YUG: Yugoslavia

un_countrynames=c("YUG"="Socialist Federal Republic of Yugoslavia",
                  "SUN"=" Union of Soviet Socialist Republics",
                  "YMD" = "Democratic Yemen", 
                  "YAR" ="Yemen Arab Republic", 
                  "TUR" = "Türkiye",
                  "CSK"="Czechoslovakia", 
                  "CZE" = "Czechia")


merged_df<-merged_df%>%
  mutate(countryname_un=countrycode(country_iso3c,origin='iso3c',destination='un.name.en',custom_match = un_countrynames))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `countryname_un = countrycode(...)`.
## Caused by warning:
## ! Some values were not matched unambiguously: TWN, XKX
#Rearrange the columns
merged_df <- merged_df %>%
  select(starts_with("countryname_"), starts_with("country_iso_"), everything())

#Order the columns
merged_df<-merged_df%>%
   arrange(-desc(country_iso3c))

#Rename variable

merged_df <- merged_df %>%
  rename(mil_police = milpol_dummy)

Partner dataset clean

# Die Daten der Sowjetunion werden für das Jahr 1992 auf Russland übertragen.
merged_df$com_mil_serv <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "1", merged_df$com_mil_serv)
merged_df$com_mil_serv_gen <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "NA", merged_df$com_mil_serv_gen)
merged_df$com_mil_serv_sur_min <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "24", merged_df$com_mil_serv_sur_min)
merged_df$com_mil_serv_dur_max <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "24", merged_df$com_mil_serv_dur_max)
merged_df$com_mil_serv_dur_min <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "24", merged_df$com_mil_serv_dur_min)
merged_df$alt_civ_serv <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "NA", merged_df$alt_civ_serv)
merged_df$mobilisation <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "0", merged_df$mobilisation)
merged_df$no_military_service <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "NA", merged_df$no_military_service)
merged_df$no_civil_service <- ifelse(merged_df$countryname_standard == "Russia" & merged_df$year == 1992, "NA", merged_df$no_civil_service)

# Drop observations based on conditions
merged_df <- merged_df[!(merged_df$countryname_standard == "Yemen, North" & merged_df$year > 1990), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Yemen, South" & merged_df$year > 1990), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Yugoslavia" & merged_df$year > 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Czechoslovakia" & merged_df$year > 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Soviet Union" & merged_df$year > 1991), ]

merged_df <- merged_df[!(merged_df$countryname_standard == "Yemen" & merged_df$year < 1991), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Republic of Serbia" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Bosnia and Herzegovina" & merged_df$year < 2006), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Croatia" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Slovenia" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Macedonia" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Czech Republic" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Slovakia" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Lithuania" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Georgia" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Estonia" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Latvia" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Ukraine" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Moldova" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Kyrgyzstan" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Uzbekistan" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Tajikistan" & merged_df$year < 1994), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Armenia" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Azerbaijan" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Turkmenistan" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Belarus" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Russia" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Kazakhstan" & merged_df$year < 1993), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Eritrea" & merged_df$year < 1994), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Kuwait" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Namibia" & merged_df$year < 1991), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Somalia" & merged_df$year < 2009), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Cyprus" & merged_df$year < 2003), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Ethiopia" & merged_df$year %in% 1991:1996), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Gabon" & merged_df$year < 1991), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Equatorial Guinea" & merged_df$year < 2009), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Haiti" & merged_df$year %in% 1995:2017), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Qatar" & merged_df$year < 2006), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Timor-Leste" & merged_df$year < 2007), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Bahrain" & merged_df$year < 2007), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Angola" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Eswatini" & merged_df$year < 1998), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "Guinea-Bissau" & merged_df$year < 1992), ]
merged_df <- merged_df[!(merged_df$countryname_standard == "South Sudan" & merged_df$year < 2012), ]

Recode specific values

# Find the rows that meet the conditions
rows_to_reset <- merged_df$country_iso3c == "PER" & merged_df$year %in% c(2012, 2013, 2014, 2015, 2016)

# Set the values to 0 for the selected rows
merged_df$mil_leader[rows_to_reset] <- 0

Change variable and drop

# Create a named vector for renaming
change_variables <- c("milpol_crime"="milpol_cripre",
                  "p_a_milpol_crime" = "p_a_milpol_cripre",
                  "milpol_law"="milpol_lawenf",
                  "p_a_milpol_law"="p_a_milpol_lawenf",
                  "milpol_peace"="milpol_peapre",
                  "p_a_milpol_peace"="p_a_milpol_peapre",
                  "milex_healthexp"="milex_health",
                  "pers_to_pop"="milpara_pop",  
                  "pers_to_phy"="milpara_phy", 
                  "hwi_norm"="weapons_pop_norm",
                  "hwi"="weapons_pop")

# Rename
merged_df <- merged_df %>%
  rename(., all_of(change_variables))


# Drop variables

drop_variables <- c("com_mil_serv_sur_min","no_military_service","no_civil_service","gmi",
                    "overmillion_bool","military_bool") #"dup","dup2

merged_df <- merged_df %>%
  select(-one_of(drop_variables))

# Variables to drop in version 1 but include in version 2

drop_variables_v1 <- c("milpol_sum","mobilisation","mil_eco_inf")

merged_df <- merged_df %>%
  select(-one_of(drop_variables_v1))

Transform NA

convert_to_numeric <- c("com_mil_serv",
                        "com_mil_serv_gen",
                        "com_mil_serv_dur_max",
                        "com_mil_serv_dur_min",
                        "alt_civ_serv",
                        #"mobilisation",  #dropped for version 1
                        "pop_total")

merged_df <- merged_df %>%
  mutate(across(all_of(convert_to_numeric), ~ifelse(. == "NA", NA, .))) %>%
  mutate(across(all_of(convert_to_numeric), as.numeric))

merged_df <- merged_df %>%
  mutate_if(is.numeric, ~ifelse(. == -999, NA, .))

FILE BUILDING

Create CSV file

#Select columns

merged_df<-merged_df%>%
  select(-c("countryname_m3","countryname_gmi","country_iso_conscription",
            "country_iso_m3","country_iso_gmi","region","countryname_milpol",
            "countryname_conscription","country_cowcode","countryname_dsg_gmi")) 
            #this are of the old ecom3 dataset: "countryname_ecom3","state_abb", #"ccode"

#Arrange columns
merged_df<-merged_df%>%
  select(year,country_iso3c,everything())

#Filter rows without un-name

merged_df<-merged_df%>%
  filter(!is.na(countryname_un))

#Filter the years 
merged_df<-merged_df%>%
  filter(year!="2021")

#create a file to verify manually

name_of_file=str_c("merged_military_dataset",sep = "_",Sys.Date(),lubridate::hour(Sys.time()),lubridate::minute(Sys.time()),".csv")

write_csv(merged_df,name_of_file)

Create SQLite file

db_file=str_c("merged_military_dataset",sep = "_",Sys.Date(),lubridate::hour(Sys.time()),lubridate::minute(Sys.time()),".sqlite")

# Create an SQLite connection
con <- dbConnect(RSQLite::SQLite(), dbname = db_file)

# Specify column types
#column_types <- c(year = "INTEGER", everything())

# Write the data frame to an SQLite table
dbWriteTable(con, "m3_dataset", merged_df, overwrite = TRUE) #, col_types = column_types)

# Close the connection
dbDisconnect(con)

COLINEARITY TEST

Prepare collinearlity algorithm

#evaluation turned false, turn on when needed

collinearity_tests <- function(response_vars, data) {
  vif_tol_results <- list()
  eigen_cindex_results <- list()

  for (response_var in response_vars) {
    # Fit the linear regression model
    formula <- paste(response_var, "~ .")
    lm_model <- lm(formula, data = data)

    # Calculate VIF and tolerance
    vif_tol_results[[response_var]] <- olsrr::ols_vif_tol(lm_model)

    # Calculate eigenvalues and condition index
    eigen_cindex_results[[response_var]] <- olsrr::ols_eigen_cindex(lm_model)
  }

  return(list(vif_tol_results, eigen_cindex_results))
}
#evaluation turned false, turn on when needed
#change names and get rid of -888 and -999

# Non standarized vars
variables_list_a <- c("milex_gdp","milex_health","milpara_pop","milpara_phy","reserve_pop",
                      "weapons_pop_norm", #this is the HWI
                      "mil_leader", "mil_origin", "mil_mod", "mil_impun", "mil_veto",
                      "mil_repress", "mil_police", "com_mil_serv", "mil_eco_dummy", "mil_eco_inf")



collinearity_df_a<-merged_df[variables_list_a]

collinearity_df_a$com_mil_serv <- as.numeric(collinearity_df_a$com_mil_serv)

# Standarized vars

variables_list_b <- c("milex_gdp_norm","milex_health_norm","milpara_pop_norm","milpara_phy_norm",
                      "reserve_pop_norm",
                      "weapons_pop_norm", #this is the HWI
                      "mil_leader", "mil_origin", "mil_mod", "mil_impun", "mil_veto",
                      "mil_repress", "mil_police", "com_mil_serv", "mil_eco_dummy", "mil_eco_inf_std")

collinearity_df_b<-merged_df[variables_list_b]

collinearity_df_b$com_mil_serv <- as.numeric(collinearity_df_a$com_mil_serv)

Run the tests

#evaluation turned false, turn on when needed

#Perform the collinearity test

#non standarized vars
results <- collinearity_tests(response_vars = variables_list_a, data = collinearity_df_a)

#standarized vars
results_b <- collinearity_tests(response_vars = variables_list_b, data = collinearity_df_b)

Create the file

#evaluation turned false, turn on when needed

file_name=str_c("collinearity_results_non",sep = "_",Sys.Date(),lubridate::hour(Sys.time()),lubridate::minute(Sys.time()),".xlsx")

# Create an Excel workbook
wb <- openxlsx::createWorkbook()

# Loop through the first list in results
for (i in 1:length(results[[1]])) {
  # Get the name of the variable for the sheet
  var_name <- names(results[[1]])[i]
  
  # Create a new sheet for each entry in the first list
  sheet_name <- paste("viftol", var_name, sep = "_")
  openxlsx::addWorksheet(wb, sheetName =sheet_name)   #sheetName = paste("Sheet_1_", i, sep = "")
  
  # Convert the list of results to a data frame and transpose it
  results_df <- as.data.frame(results[[1]][[i]])
  
  # Write the results of the variable to the sheet
  openxlsx::writeData(wb, sheet = i, x = results_df, startCol = 1, startRow = 1)
}

# Loop through the second list in results
for (i in 1:length(results[[2]])) {
  # Get the name of the variable for the sheet
  var_name <- names(results[[2]])[i]
  
  
  # Create a new sheet with the variable name and index
  sheet_name <- paste("eigenindex", var_name, sep = "_")
  openxlsx::addWorksheet(wb, sheetName =sheet_name)
  
  # Convert the list of results to a data frame and transpose it
  results_df <- as.data.frame(results[[2]][[i]])
  
  # Write the results of the variable to the sheet
  openxlsx::writeData(wb, sheet = i + length(results[[1]]), x = results_df, startCol = 1, startRow = 1)
}

# Save the Excel workbook to a file
openxlsx::saveWorkbook(wb, file_name)
#evaluation turned false, turn on when needed


file_name=str_c("collinearity_results_std",sep = "_",Sys.Date(),lubridate::hour(Sys.time()),lubridate::minute(Sys.time()),".xlsx")

# Create an Excel workbook
wb <- openxlsx::createWorkbook()

# Loop through the first list in results
for (i in 1:length(results_b[[1]])) {
  # Get the name of the variable for the sheet
  var_name <- names(results_b[[1]])[i]
  
  # Create a new sheet for each entry in the first list
  sheet_name <- paste("viftol", var_name, sep = "_")
  openxlsx::addWorksheet(wb, sheetName =sheet_name)   #sheetName = paste("Sheet_1_", i, sep = "")
  
  # Convert the list of results to a data frame and transpose it
  results_df <- as.data.frame(results_b[[1]][[i]])
  
  # Write the results of the variable to the sheet
  openxlsx::writeData(wb, sheet = i, x = results_df, startCol = 1, startRow = 1)
}

# Loop through the second list in results
for (i in 1:length(results_b[[2]])) {
  # Get the name of the variable for the sheet
  var_name <- names(results_b[[2]])[i]
  
  
  # Create a new sheet with the variable name and index
  sheet_name <- paste("eigenindex", var_name, sep = "_")
  openxlsx::addWorksheet(wb, sheetName =sheet_name)
  
  # Convert the list of results to a data frame and transpose it
  results_df <- as.data.frame(results_b[[2]][[i]])
  
  # Write the results of the variable to the sheet
  openxlsx::writeData(wb, sheet = i + length(results_b[[1]]), x = results_df, startCol = 1, startRow = 1)
}

# Save the Excel workbook to a file
openxlsx::saveWorkbook(wb, file_name)