Data Cleaning

ENDIREH Data

As mentioned on the previous tab, the ENDIREH data set is a compilation of the answers to a national survey.

Its objective is to visualize how that metric has been increasing over the years. ENDIREH provides a manual that contains example R scripts showing how to relate the answer keys and calculate the experienced-violence metric.

The manual was used as a reference to build the following R script, which collects the information from the ENDIREH RData file, creates an object containing the violence information separated by “section” (emotional, economic, physical), and interprets the answers based on the keys. The script outputs three CSV files with the experienced-violence metric, segregated by state, for overall, emotional, and economic violence (the overall and emotional calculations are shown below).

Code

library(survey)  # survey-design objects and ratio estimators used below

# Load the section IVa-VD table and drop empty or constant columns before exporting
tiv <- data.frame(TB_SEC_IVaVD)
data_new <- tiv[ , colSums(is.na(tiv)) < nrow(tiv)]
data <- data_new[vapply(data_new, function(x) length(unique(x)) > 1, logical(1L))]
write.csv(data, "C:/Users/valer/Desktop/tiv_clean.csv", row.names = TRUE)

#Variables that hold the answers we want to calculate
P7_6 <- paste0("P7_6_", 1:18)
P7_8 <- paste0("P7_8_", 1:18) #Academic
P8_9 <- paste0("P8_9_", 1:19)
P8_11 <- paste0("P8_11_", 1:19) # Career
P8_8 <- paste0("P8_8_", 1:9) # Discrimination
P9_1 <- paste0("P9_1_", 1:16)
P9_3 <- paste0("P9_3_", 1:16) # community
P11_1 <- paste0("P11_1_", 1:20) # Family
P14_1 <- paste0("P14_1_", 1:38)
P14_1[c(23, 24, 35:38)] <- paste0(P14_1[c(23, 24, 35:38)], "AB") # Relationship
P14_3 <- paste0("P14_3_", 1:38)
P14_3[c(23, 24, 35:38)] <- paste0(P14_3[c(23, 24, 35:38)], "AB") # Relationship

variables <- c(
    "UPM_DIS", "EST_DIS", "FAC_MUJ", "CVE_ENT", "T_INSTRUM", "P7_1", "P7_2", P7_6, P7_8,
    "P8_1", "P8_2", "P8_3_1_1", "P8_3_1_2", "P8_3_2_1", "P8_3_2_2", "P8_3_2_3",
    "P8_4", "P8_5", P8_9, P8_11, P8_8, P9_1, P9_3, P11_1, "P13_C_1", P14_1, P14_3
)

#collecting question answers
muj <- TB_SEC_IVaVD[, variables]
muj$vtot_lv_con <- ifelse(
(muj$P7_6_1%in%'1' | muj$P7_6_2%in%'1' | muj$P7_6_3%in%'1' | 
 muj$P7_6_4%in%'1' | muj$P7_6_5%in%'1' | muj$P7_6_6%in%'1' | 
 muj$P7_6_7%in%'1' | muj$P7_6_8%in%'1' | muj$P7_6_9%in%'1' | 
 muj$P7_6_10%in%'1' | muj$P7_6_11%in%'1' | muj$P7_6_12%in%'1' | 
 muj$P7_6_13%in%'1' | muj$P7_6_14%in%'1' | muj$P7_6_15%in%'1' | 
 muj$P7_6_16%in%'1' | muj$P7_6_17%in%'1' | muj$P7_6_18%in%'1' |
 muj$P8_3_1_1%in%'1' | muj$P8_3_1_2%in%'1' | muj$P8_3_2_1%in%'1' | 
 muj$P8_3_2_2%in%'1' | muj$P8_3_2_3%in%'1' | muj$P8_8_1%in%'1' | 
 muj$P8_8_2%in%'1' | muj$P8_8_3%in%'1' | muj$P8_8_4%in%'1' | 
 muj$P8_8_5%in%'1' | muj$P8_8_6%in%'1' | muj$P8_8_7%in%'1' | 
 muj$P8_8_8%in%'1' | muj$P8_8_9%in%'1' | muj$P8_9_1%in%'1' | 
 muj$P8_9_2%in%'1' | muj$P8_9_3%in%'1' | muj$P8_9_4%in%'1' | 
 muj$P8_9_5%in%'1' | muj$P8_9_6%in%'1' | muj$P8_9_7%in%'1' | 
 muj$P8_9_8%in%'1' | muj$P8_9_9%in%'1' | muj$P8_9_10%in%'1' | 
 muj$P8_9_11%in%'1' | muj$P8_9_12%in%'1' | muj$P8_9_13%in%'1' | 
 muj$P8_9_14%in%'1' | muj$P8_9_15%in%'1' | muj$P8_9_16%in%'1' | 
 muj$P8_9_17%in%'1' | muj$P8_9_18%in%'1' | muj$P8_9_19%in%'1' | 
 muj$P9_1_1%in%'1' | muj$P9_1_2%in%'1' | muj$P9_1_3%in%'1' | 
 muj$P9_1_4%in%'1' | muj$P9_1_5%in%'1' | muj$P9_1_6%in%'1' | 
 muj$P9_1_7%in%'1' | muj$P9_1_8%in%'1' | muj$P9_1_9%in%'1' | 
 muj$P9_1_10%in%'1' | muj$P9_1_11%in%'1' | muj$P9_1_12%in%'1' | 
 muj$P9_1_13%in%'1' | muj$P9_1_14%in%'1' | muj$P9_1_15%in%'1' | 
 muj$P9_1_16%in%'1' | 
 muj$P11_1_1%in%c( '1','2','3') | muj$P11_1_2%in%c( '1','2','3') | 
 muj$P11_1_3%in%c( '1','2','3') | muj$P11_1_4%in%c( '1','2','3') | 
 muj$P11_1_5%in%c( '1','2','3') | muj$P11_1_6%in%c( '1','2','3') | 
 muj$P11_1_7%in%c( '1','2','3') | muj$P11_1_8%in%c( '1','2','3') | 
 muj$P11_1_9%in%c( '1','2','3') | muj$P11_1_10%in%c( '1','2','3') | 
 muj$P11_1_11%in%c( '1','2','3') | muj$P11_1_12%in%c( '1','2','3') | 
 muj$P11_1_13%in%c( '1','2','3') | muj$P11_1_14%in%c( '1','2','3') | 
 muj$P11_1_15%in%c( '1','2','3') | muj$P11_1_16%in%c( '1','2','3') | 
 muj$P11_1_17%in%c( '1','2','3') | muj$P11_1_18%in%c( '1','2','3') | 
 muj$P11_1_19%in%c( '1','2','3') | muj$P11_1_20%in%c( '1','2','3') | 
 muj$P14_1_1%in%c( '1','2','3') | muj$P14_1_2%in%c( '1','2','3') | 
 muj$P14_1_3%in%c( '1','2','3') | muj$P14_1_4%in%c( '1','2','3') | 
 muj$P14_1_5%in%c( '1','2','3') | muj$P14_1_6%in%c( '1','2','3') | 
 muj$P14_1_7%in%c( '1','2','3') | muj$P14_1_8%in%c( '1','2','3') | 
 muj$P14_1_9%in%c( '1','2','3') | muj$P14_1_10%in%c( '1','2','3') | 
 muj$P14_1_11%in%c( '1','2','3') | muj$P14_1_12%in%c( '1','2','3') | 
 muj$P14_1_13%in%c( '1','2','3') | muj$P14_1_14%in%c( '1','2','3') | 
 muj$P14_1_15%in%c( '1','2','3') | muj$P14_1_16%in%c( '1','2','3') | 
 muj$P14_1_17%in%c( '1','2','3') | muj$P14_1_18%in%c( '1','2','3') | 
 muj$P14_1_19%in%c( '1','2','3') | muj$P14_1_20%in%c( '1','2','3') | 
 muj$P14_1_21%in%c( '1','2','3') | muj$P14_1_22%in%c( '1','2','3') | 
 muj$P14_1_23AB%in%c( '1','2','3') | muj$P14_1_24AB%in%c( '1','2','3') | 
 muj$P14_1_25%in%c( '1','2','3') | muj$P14_1_26%in%c( '1','2','3') | 
 muj$P14_1_27%in%c( '1','2','3') | muj$P14_1_28%in%c( '1','2','3') | 
 muj$P14_1_29%in%c( '1','2','3') | muj$P14_1_30%in%c( '1','2','3') | 
 muj$P14_1_31%in%c( '1','2','3') | muj$P14_1_32%in%c( '1','2','3') | 
 muj$P14_1_33%in%c( '1','2','3') | muj$P14_1_34%in%c( '1','2','3') | 
 muj$P14_1_35AB%in%c( '1','2','3') | muj$P14_1_36AB%in%c( '1','2','3') | 
 muj$P14_1_37AB%in%c( '1','2','3') | muj$P14_1_38AB%in%c( '1','2','3')),1,0)
muj$pob_muj <- 1 
# Defining the sample design: UPM_DIS identifies the primary sampling units,
# EST_DIS the design strata, and FAC_MUJ the expansion factor for each woman
disenio <- svydesign(id=~UPM_DIS, strata=~EST_DIS, data=muj, weights=~FAC_MUJ, nest=TRUE) 
# calculating violence estimator
# National
n_vtot_lv_con <- svyratio(~vtot_lv_con, denominator=~pob_muj, disenio, na.rm = TRUE) 
# state
e_vtot_lv_con <- svyby(~vtot_lv_con, denominator=~pob_muj, by=~CVE_ENT, disenio, 
svyratio, na.rm = TRUE) 
# Estimations
# National
est_n_vtot_lv_con <- n_vtot_lv_con[[1]]*100
se_n_vtot_lv_con <- SE(n_vtot_lv_con)*100
cv_n_vtot_lv_con <- cv(n_vtot_lv_con)*100
li_n_vtot_lv_con <- confint(n_vtot_lv_con,level=0.90)[1,1]*100
ls_n_vtot_lv_con <- confint(n_vtot_lv_con,level=0.90)[1,2]*100
# State
est_e_vtot_lv_con <- e_vtot_lv_con[[2]]*100
se_e_vtot_lv_con <- SE(e_vtot_lv_con)*100
cv_e_vtot_lv_con <- cv(e_vtot_lv_con)*100
li_e_vtot_lv_con <- confint(e_vtot_lv_con,level=0.90)[,1]*100
ls_e_vtot_lv_con <- confint(e_vtot_lv_con,level=0.90)[,2]*100
# Defining values by state
state<-c("Estados Unidos Mexicanos", "Aguascalientes", "Baja California", "Baja California Sur",
 "Campeche", "Coahuila de Zaragoza", "Colima", "Chiapas", "Chihuahua", "Ciudad de México", 
 "Durango", "Guanajuato", "Guerrero", "Hidalgo", "Jalisco", "Estado de México", 
 "Michoacán de Ocampo", "Morelos", "Nayarit", "Nuevo León", "Oaxaca", "Puebla", "Querétaro",
 "Quintana Roo", "San Luis Potosí", "Sinaloa", "Sonora", "Tabasco", "Tamaulipas", "Tlaxcala", 
 "Veracruz de Ignacio de la Llave", "Yucatán", "Zacatecas") 
est_vtot_lv_con <- as.data.frame(cbind(state, 
 est_vtot_lv_con= c(est_n_vtot_lv_con, est_e_vtot_lv_con)))
se_vtot_lv_con <- as.data.frame(cbind(state, 
 se_vtot_lv_con= c(se_n_vtot_lv_con, se_e_vtot_lv_con)))
cv_vtot_lv_con <- as.data.frame(cbind(state, 
 cv_vtot_lv_con= c(cv_n_vtot_lv_con, cv_e_vtot_lv_con)))
lim_vtot_lv_con <- as.data.frame(cbind(state, 
 linf_vtot_lv_con= c(li_n_vtot_lv_con, li_e_vtot_lv_con),
 lsup_vtot_lv_con= c(ls_n_vtot_lv_con, ls_e_vtot_lv_con)))
row.names(est_vtot_lv_con) <- row.names(se_vtot_lv_con) <- row.names(cv_vtot_lv_con) <- row.names(lim_vtot_lv_con) <- NULL
list_of_datasets <- list("Estimaciones" = est_vtot_lv_con, 
 "std err" = se_vtot_lv_con, 
 "Coef var" = cv_vtot_lv_con, 
 "Int Conf" = lim_vtot_lv_con)
# write.csv() flattens the named list of data frames into side-by-side columns
write.csv(list_of_datasets, file = "violence.csv")

##Calculate emotional violence

P7_6 <- paste0("P7_6_", 1:18)
P7_8 <- paste0("P7_8_", 1:18)
P8_9 <- paste0("P8_9_", 1:19)
P8_11 <- paste0("P8_11_", 1:19)
P8_8 <- paste0("P8_8_", 1:9)
P9_1 <- paste0("P9_1_", 1:16)
P9_3 <- paste0("P9_3_", 1:16)
P11_1 <- paste0("P11_1_", 1:20)
P14_1 <- paste0("P14_1_", 1:38)
P14_1[c(23, 24, 35:38)] <- paste0(P14_1[c(23, 24, 35:38)], "AB")
P14_3 <- paste0("P14_3_", 1:38)
P14_3[c(23, 24, 35:38)] <- paste0(P14_3[c(23, 24, 35:38)], "AB")
variables <- c(
    "UPM_DIS", "EST_DIS", "FAC_MUJ", "CVE_ENT", "T_INSTRUM", "P7_1", "P7_2", P7_6, P7_8,
    "P8_1", "P8_2", "P8_3_1_1", "P8_3_1_2", "P8_3_2_1", "P8_3_2_2", "P8_3_2_3",
    "P8_4", "P8_5", P8_9, P8_11, P8_8, P9_1, P9_3, P11_1, "P13_C_1", P14_1, P14_3
)
#Emotional violence questions
muj$vpsi_lv_con <- ifelse(
 (muj$P7_6_4%in%'1' | muj$P7_6_9%in%'1' | muj$P7_6_13%in%'1' |
 muj$P7_6_16%in%'1' | muj$P7_6_18%in%'1' | muj$P8_9_2 %in%'1' | 
 muj$P8_9_7%in%'1' | muj$P8_9_11%in%'1' | muj$P8_9_12%in%'1' | 
 muj$P8_9_17%in%'1' | muj$P8_9_18%in%'1' | muj$P9_1_2%in%'1' | 
 muj$P9_1_3%in%'1' | muj$P9_1_11%in%'1' | muj$P9_1_15%in%'1' | 
 muj$P11_1_1%in%c( '1','2','3') | muj$P11_1_6%in%c( '1','2','3') | 
 muj$P11_1_7%in%c( '1','2','3') | muj$P11_1_12%in%c( '1','2','3') | 
 muj$P11_1_14%in%c( '1','2','3') | muj$P11_1_17%in%c( '1','2','3') | 
 muj$P11_1_20%in%c( '1','2','3') | muj$P14_1_10%in%c( '1','2','3') | 
 muj$P14_1_11%in%c( '1','2','3') | muj$P14_1_12%in%c( '1','2','3') | 
 muj$P14_1_13%in%c( '1','2','3') | muj$P14_1_14%in%c( '1','2','3') | 
 muj$P14_1_15%in%c( '1','2','3') | muj$P14_1_16%in%c( '1','2','3') | 
 muj$P14_1_17%in%c( '1','2','3') | muj$P14_1_18%in%c( '1','2','3') | 
 muj$P14_1_19%in%c( '1','2','3') | muj$P14_1_20%in%c( '1','2','3') | 
 muj$P14_1_21%in%c( '1','2','3') | muj$P14_1_22%in%c( '1','2','3') | 
 muj$P14_1_23AB%in%c( '1','2','3') | muj$P14_1_24AB%in%c( '1','2','3') | 
 muj$P14_1_31%in%c( '1','2','3')),1,0)
muj$pob_muj <- 1 
disenio <- 
 svydesign(id=~UPM_DIS, strata=~EST_DIS, data=muj, weights=~FAC_MUJ, nest=TRUE) 
n_vpsi_lv_con <- svyratio(~vpsi_lv_con, denominator=~pob_muj, disenio, na.rm = TRUE) 

e_vpsi_lv_con <- svyby(~vpsi_lv_con, denominator=~pob_muj, by=~CVE_ENT, disenio, 
 svyratio, na.rm = TRUE) 
# National
est_n_vpsi_lv_con <- n_vpsi_lv_con[[1]]*100
se_n_vpsi_lv_con <- SE(n_vpsi_lv_con)*100
cv_n_vpsi_lv_con <- cv(n_vpsi_lv_con)*100
li_n_vpsi_lv_con <- confint(n_vpsi_lv_con,level=0.90)[1,1]*100
ls_n_vpsi_lv_con <- confint(n_vpsi_lv_con,level=0.90)[1,2]*100
# State
est_e_vpsi_lv_con <- e_vpsi_lv_con[[2]]*100
se_e_vpsi_lv_con <- SE(e_vpsi_lv_con)*100
cv_e_vpsi_lv_con <- cv(e_vpsi_lv_con)*100
li_e_vpsi_lv_con <- confint(e_vpsi_lv_con,level=0.90)[,1]*100
ls_e_vpsi_lv_con <- confint(e_vpsi_lv_con,level=0.90)[,2]*100
# states
state<-c("Estados Unidos Mexicanos", "Aguascalientes", "Baja California", "Baja California Sur", 
 "Campeche", "Coahuila de Zaragoza", "Colima", "Chiapas", "Chihuahua", "Ciudad de México", 
 "Durango", "Guanajuato", "Guerrero", "Hidalgo", "Jalisco", "Estado de México", 
 "Michoacán de Ocampo", "Morelos", "Nayarit", "Nuevo León", "Oaxaca", "Puebla", "Querétaro", 
 "Quintana Roo", "San Luis Potosí", "Sinaloa", "Sonora", "Tabasco", "Tamaulipas", "Tlaxcala", 
 "Veracruz de Ignacio de la Llave", "Yucatán", "Zacatecas") 
est_vpsi_lv_con <- as.data.frame(cbind(state, 
est_vpsi_lv_con= c(est_n_vpsi_lv_con, est_e_vpsi_lv_con)))
se_vpsi_lv_con <- as.data.frame(cbind(state, 
 se_vpsi_lv_con= c(se_n_vpsi_lv_con, se_e_vpsi_lv_con)))
cv_vpsi_lv_con <- as.data.frame(cbind(state, 
 cv_vpsi_lv_con= c(cv_n_vpsi_lv_con, cv_e_vpsi_lv_con)))
lim_vpsi_lv_con <- as.data.frame(cbind(state, 
 linf_vpsi_lv_con= c(li_n_vpsi_lv_con, li_e_vpsi_lv_con),
 lsup_vpsi_lv_con= c(ls_n_vpsi_lv_con, ls_e_vpsi_lv_con)))
row.names(est_vpsi_lv_con) <- row.names(se_vpsi_lv_con) <- row.names(cv_vpsi_lv_con) <- 
 row.names(lim_vpsi_lv_con) <- NULL
list_of_datasets <- list("Estimaciones" = est_vpsi_lv_con, 
 "Error Estandar" = se_vpsi_lv_con)
write.csv(list_of_datasets, file = "emotional_violence.csv")

After tidying the data in R, Python is used to continue cleaning the data and to prepare it for the exploration in the next sections.

Although the ENDIREH data is supposed to be mostly “clean”, without missing or inconsistent values, the file is cleaned once more to be sure. We also take the columns, or “answers”, from different sections of the survey and combine them into a single data set of “questions”.

Code
import pandas as pd
import numpy as np

print("Endireh data shape for each section:")
df_xii = pd.read_csv('data/TB_SEC_XII.csv')
print(df_xii.shape)

df_xiv = pd.read_csv('data/TB_SEC_XIV.csv')
print(df_xiv.shape)

df_vi = pd.read_csv('data/TB_SEC_VI.csv')
print(df_vi.shape)

df_iv = pd.read_csv('data/TB_SEC_IV.csv')
print(df_iv.shape)

df = pd.merge(df_xii, df_xiv, on='ID_PER', how='outer')
df = pd.merge(df, df_vi, on='ID_PER', how='outer')
df = pd.merge(df, df_iv, on='ID_PER', how='outer')
print("final shape",df.shape)

# Drop columns that hold a single constant value
df = df.loc[:, df.apply(pd.Series.nunique) != 1]
# Normalize infinite values to NaN; missing answers are kept and handled later
df = df.replace([np.inf, -np.inf], np.nan)

df.head(3)
Endireh data shape for each section:
(110127, 54)
(110127, 224)
(110127, 26)
(110127, 75)
final shape (110127, 376)
ID_VIV_x ID_PER UPM_x VIV_SEL_x HOGAR_x N_REN_x DOMINIO_x CVE_ENT_x NOM_ENT_x CVE_MUN_x ... P4_13_3 P4_13_4 P4_13_5 P4_13_6 P4_13_7 FAC_VIV_y FAC_MUJ_y ESTRATO_y UPM_DIS_y EST_DIS_y
0 100003.01 0100003.01.1.02 100003 1 1 2 U 1 AGUASCALIENTES 1 ... NaN 1.0 NaN NaN NaN 113 113 4 1 3
1 100003.02 0100003.02.1.02 100003 2 1 2 U 1 AGUASCALIENTES 1 ... NaN 1.0 NaN NaN NaN 113 113 4 1 3
2 100003.03 0100003.03.1.03 100003 3 1 3 U 1 AGUASCALIENTES 1 ... NaN NaN NaN NaN NaN 113 227 4 1 3

3 rows × 376 columns

After combining the data from each section into a single data frame, we set aside columns that will not be used as features, such as identifiers. The identifier columns (entity, municipality, description of home characteristics, and so on) are stored separately for exploration.

Code
#Exclude identifiers from the original df

ids = ["ID_VIV_x", "ID_PER", "UPM_x", "VIV_SEL_x", "HOGAR_x", "N_REN_x", "DOMINIO_x", "CVE_ENT_x", "NOM_ENT_x",
       "CVE_MUN_x", "NOM_MUN_x", "T_INSTRUM_x"]

df_ids = df[ids]
print(df_ids.shape)
print(df_ids.head(3))
df_ids.to_csv("data/endireh_ids.csv", index=False)

df = df.drop(ids, axis=1)
(110127, 23)
    ID_VIV_x   ID_VIV_x           ID_PER   UPM_x   UPM_x  VIV_SEL_x  \
0  100003.01  100003.01  0100003.01.1.02  100003  100003          1   
1  100003.02  100003.02  0100003.02.1.02  100003  100003          2   
2  100003.03  100003.03  0100003.03.1.03  100003  100003          3   

   VIV_SEL_x  HOGAR_x  HOGAR_x  N_REN_x  ... CVE_ENT_x CVE_ENT_x  \
0          1        1        1        2  ...         1         1   
1          2        1        1        2  ...         1         1   
2          3        1        1        3  ...         1         1   

        NOM_ENT_x       NOM_ENT_x CVE_MUN_x CVE_MUN_x       NOM_MUN_x  \
0  AGUASCALIENTES  AGUASCALIENTES         1         1  AGUASCALIENTES   
1  AGUASCALIENTES  AGUASCALIENTES         1         1  AGUASCALIENTES   
2  AGUASCALIENTES  AGUASCALIENTES         1         1  AGUASCALIENTES   

        NOM_MUN_x T_INSTRUM_x T_INSTRUM_x  
0  AGUASCALIENTES          A1          A1  
1  AGUASCALIENTES          A1          A1  
2  AGUASCALIENTES          B2          B2  

[3 rows x 23 columns]

To label our data set we use questions P14_1_10 and P14_1_14, which ask about similar emotional-violence experiences. P14_1_10 asks whether an emotionally violent situation occurred and P14_1_14 asks about its frequency.

Code
#Exclude questions that act as key for emotional violence

#Emotional violence Key
p10= df["P14_1_10"]
#Emotional violence Key
p14= df["P14_1_14"]

# Answers 1-3 mean the situation happened; 4, 9 and missing mean it did not or was not answered
p10 = p10.replace([1, 2, 3], 1)
p10 = p10.replace([4, 9, np.nan], 0)
p14 = p14.replace([1, 2, 3], 1)
p14 = p14.replace([4, 9, np.nan], 0)

# A respondent is labeled positive if either question indicates emotional violence
label = p10.copy()
label[p14 == 1] = 1

df["label"] = label
df.label.value_counts()

# Randomly drop 40% of the negative rows to reduce the class imbalance
df = df.drop(df.query('label == 0').sample(frac=.4).index)

print("label counts ", df.label.value_counts())

df.to_csv("data/endireh_ev.csv",index=False)
label counts  0.0    54940
1.0    18560
Name: label, dtype: int64

Twitter Data

Once the tweet-collection phase described on the previous tab is finished, the resulting CSV file is passed to the data-cleaning Python script “data_cleaning_tweets.py”.

This Python script is composed of six functions. First the data goes through the “clean” function, which receives the data together with the vectorizing or tokenizing method the user decides to implement. Two methods can be used: CountVectorizer from the sklearn library or TweetTokenizer from the NLTK library. The main difference is that TweetTokenizer understands natural-language expressions such as emoticon faces (“:) :P xD”) and recognizes hashtags and user mentions as single tokens.
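
Neither option appears in the listing further below, so here is a minimal sketch of what each one might look like, assuming the cleaned tweets end up in a clean_text column as produced by the clean function (the example tweets are illustrative only):

Code
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer

# A couple of already-cleaned example tweets (illustrative only)
df = pd.DataFrame({"clean_text": ["user que buen dia :)", "odio el hashtag numbr xD"]})

# Option 1: bag-of-words counts with scikit-learn
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["clean_text"])  # sparse document-term matrix

# Option 2: tweet-aware tokenization with NLTK
tweet_tok = TweetTokenizer(preserve_case=False, reduce_len=True)
tokens = [tweet_tok.tokenize(text) for text in df["clean_text"]]
# TweetTokenizer keeps emoticons such as ":)" and "xD" as single tokens and does
# not split hashtags or user mentions apart.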

After tokenizing, the clean function removes stopwords and special characters, an essential part of the natural language processing pipeline. It also normalizes user mentions, for example replacing @HarryStyles with a generic @user token.
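
The stopword removal itself is not reproduced in the listing below; a minimal sketch using NLTK’s Spanish stopword list might look like this (the remove_stopwords helper is illustrative and not part of the original script):

Code
from nltk.corpus import stopwords  # requires nltk.download("stopwords") once

spanish_stopwords = set(stopwords.words("spanish"))

def remove_stopwords(tokens):
    # Drop common Spanish function words from an already tokenized tweet
    return [tok for tok in tokens if tok.lower() not in spanish_stopwords]

# Example: remove_stopwords(["no", "me", "gusta", "el", "acoso"]) -> ["gusta", "acoso"]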

Once the cleaning phase is done, the script offers two lemmatizing functions, one based on spaCy and one based on Freeling, and one stemming function that uses the SnowballStemmer from the NLTK library. The purpose is to compare the result of each function and find the best technique for preprocessing tweets in Spanish. Finally, each function outputs a CSV file with the cleaned text.

Code
import pandas as pd
import re
import requests
from nltk.stem.snowball import SnowballStemmer

def clean(df):
    email_re = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
    replace = [
        (r"<a[^>]*>(.*?)</a>", " url"),
        (email_re, "email"),
        (r"@[a-zA-Z0-9_]{0,15}", " user"),
        (r"_[a-zA-Z0-9_]{0,15}", " user"),
        (r"#\w*[a-zA-Z]\w*", " hashtag"),
        (r"(?<=\d),(?=\d)", ""),
        (r"\d+", "numbr"),
        (r"[\t\n\r\*\.\@\,\-\/]", " "),
        (r"\s+", " "),
        (r'[^\w\s]', ''),
        (r'/(.)(?=.*\1)/g', "")
    ]
    # Apply the substitutions sequentially so every pattern acts on the result
    # of the previous one (not on the raw text each time)
    clean_text = [str(text) for text in df["text"]]
    for pattern, repl in replace:
        clean_text = [re.sub(pattern, repl, text) for text in clean_text]
    df["clean_text"] = clean_text


def lemmatizing_freeling(filename):
    file_origin = r"{0}.txt".format(filename)
    file_lemm = r"{0}_lemm_f.csv".format(filename)

    files = {'file': open(file_origin, 'rb')}
    params = {'outf': 'tagged', 'format': 'json'}
    url = "http://www.corpus.unam.mx/servicio-freeling/analyze.php"
    r = requests.post(url, files=files, params=params)
    obj = r.json()
    lemmatized_tweet = ""

    for sentence in obj:
        for word in sentence:
            lemmatized_tweet += (word["lemma"])+" "
    lemmatized_tweet = lemmatized_tweet.replace(" { ", "\n")

    with open(file_lemm, "w") as file_csv:
        file_csv.write(lemmatized_tweet)
    return lemmatized_tweet


def lemmatizing_spacy(filename, tokenizer):
    file_origin = r"{0}.txt".format(filename)
    with open(file_origin, encoding="utf-8") as f:
        text = f.read()

    doc = tokenizer(text)
    file_lemm = r"{0}_lemm_s.csv".format(filename)

    lemmatized_tweet = ""
    for word in doc:
        lemmatized_tweet += " "+(word.lemma_)+" "
    lemmatized_tweet = lemmatized_tweet.replace(" { ", " ")

    with open(file_lemm, "w") as file_csv:
        file_csv.write(lemmatized_tweet)
    return ' '.join([word.lemma_ for word in doc])


def stemming(clean_text, tokenizer, filename):
    stemmer = SnowballStemmer("spanish")
    file_stem = r"{0}_stemm.csv".format(filename)

    doc = tokenizer(clean_text)
    stemmed_tweet = ""
    for word in doc:
        # str(word) also works when the tokenizer yields spaCy tokens
        stemmed_tweet += stemmer.stem(str(word)) + " "
    stemmed_tweet = stemmed_tweet.replace(" { ", " ")

    with open(file_stem, "w") as file_csv:
        file_csv.write(stemmed_tweet)
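
The script’s driver code is not shown above. A minimal sketch of how these functions could be wired together, assuming the collected tweets arrive in a file named tweets.csv with a text column and that spaCy’s Spanish model es_core_news_sm is installed (both names are illustrative), might be:

Code
import pandas as pd
import spacy

df = pd.read_csv("tweets.csv")   # collected tweets, "text" column assumed
clean(df)                        # adds the regex-cleaned "clean_text" column

# Write the cleaned text to disk so the Freeling web service can lemmatize it
df["clean_text"].to_csv("tweets_clean.txt", index=False, header=False)
lemmatizing_freeling("tweets_clean")

# Lemmatize the same text with spaCy's Spanish pipeline
nlp = spacy.load("es_core_news_sm")
lemmatizing_spacy("tweets_clean", nlp)

# Stem a sample of the cleaned tweets with the Spanish SnowballStemmer
sample_text = " ".join(df["clean_text"].astype(str).head(1000))
stemming(sample_text, nlp, "tweets_clean")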

Conclusion

As a conclusion for the cleaning section:

Although ENDIREH is a large survey that has been designed by INEGI experts over the years, the manipulation and cleaning of the data show that the emotional-violence questions are only a small subset scattered across different sections, suggesting that collecting information about this type of violence has not been a priority.

For example, the “economic violence” section collects answers related to bullying towards women, which corresponds to the emotional-violence data analyzed in this project. The fact that it is placed in a different section could be raised as a possible improvement to the ENDIREH design.