From f5bdfa8c1cf554470a29678835b58d16d985b628 Mon Sep 17 00:00:00 2001
From: nicolefindstar <yue.wu@berkeley.edu>
Date: Thu, 1 Aug 2024 17:04:30 +0200
Subject: [PATCH] Updated EIG, formatted Stata, added SPSS, R and Python

---
 .../EIG-indicator-tidyverse.R                 | 46 +++++++++
 .../EIG-indicator.do                          | 97 ++++++++++---------
 .../EIG-indicator.py                          | 39 ++++++++
 .../EIG-indicator.sps                         | 81 ++++++++++++++++
 4 files changed, 218 insertions(+), 45 deletions(-)
 create mode 100644 Indicators/Engagement-in-income-generating-activities/EIG-indicator-tidyverse.R
 create mode 100644 Indicators/Engagement-in-income-generating-activities/EIG-indicator.py
 create mode 100644 Indicators/Engagement-in-income-generating-activities/EIG-indicator.sps

diff --git a/Indicators/Engagement-in-income-generating-activities/EIG-indicator-tidyverse.R b/Indicators/Engagement-in-income-generating-activities/EIG-indicator-tidyverse.R
new file mode 100644
index 0000000..f45acd4
--- /dev/null
+++ b/Indicators/Engagement-in-income-generating-activities/EIG-indicator-tidyverse.R
@@ -0,0 +1,46 @@
+#------------------------------------------------------------------------------
+#                          WFP Standardized Scripts
+#    Engagement in Income Generation Activities (EIG) Calculation
+#------------------------------------------------------------------------------
+
+# This script calculates the Engagement in Income Generation Activities (EIG)
+# using standard variable names and sample data.
+# Detailed guidelines can be found in the WFP documentation.
+
+library(tidyverse)
+library(labelled)
+library(expss)
+
+# Add sample data; "n/a" marks a missing answer, as in the Stata destring step
+data <- read_csv("~/GitHub/RAMResourcesScripts/Static/EIG_Sample_Survey.csv",
+                 na = c("", "NA", "n/a"))
+
+# Strip "/" from every header (the v* names are a Stata import artifact)
+data <- data %>%
+  rename_with(~ gsub("/", "", .x, fixed = TRUE))
+
+# Up to 9 training types; a missing answer counts as no training (0)
+training_cols <- intersect(paste0("PTrainingTypes", 1:9), names(data))
+stopifnot(length(training_cols) > 0)
+data <- data %>%
+  mutate(across(all_of(training_cols), ~ coalesce(as.numeric(.x), 0)))
+
+# Member flags: engaged post-training, and took part in at least one training
+data <- data %>%
+  mutate(across(starts_with("PPostTrainingEmpl"), as.numeric),
+         across(starts_with("PPostTrainingIncome"), as.numeric),
+         PostTrainingEngagement = pmax(PPostTrainingEmpl, PPostTrainingIncome, na.rm = TRUE),
+         PTrainingPart = do.call(pmax, unname(as.list(.[training_cols]))))
+
+# Household counts; EIG stays NA when a household has no training participant
+household_data <- data %>%
+  group_by(household_id) %>%
+  summarise(PostTrainingEngagement = sum(PostTrainingEngagement, na.rm = TRUE),
+            PTrainingPartNb = sum(PTrainingPart, na.rm = TRUE)) %>%
+  mutate(EIG = if_else(PTrainingPartNb > 0,
+                       PostTrainingEngagement / PTrainingPartNb, NA_real_))
+
+# Summary statistics for full sample
+summary(household_data$EIG)
+
+# End of Scripts
\ No newline at end of file
diff --git a/Indicators/Engagement-in-income-generating-activities/EIG-indicator.do b/Indicators/Engagement-in-income-generating-activities/EIG-indicator.do
index dbb6c71..e6fd7c7 100644
--- a/Indicators/Engagement-in-income-generating-activities/EIG-indicator.do
+++ b/Indicators/Engagement-in-income-generating-activities/EIG-indicator.do
@@ -1,66 +1,73 @@
-***Create Engagement in Income Generation Activities [EIG] using standard variables names 
+*------------------------------------------------------------------------------
+*                          WFP Standardized Scripts
+*    Engagement in Income Generation Activities (EIG) Calculation
+*------------------------------------------------------------------------------
 
-/* Import static sample data */
+* This script calculates the Engagement in Income Generation Activities (EIG)
+* using standard variable names and sample data.
+* Detailed guidelines can be found in the WFP documentation.
+
+* Import static sample data
 import delim using "../../Static/EIG_Sample_Survey.csv", clear case(preserve) bindquotes(strict) varn(1)
 
-/* Rearrange variable names and codes to ensure consistency in the dataset*/
-* in particular, variables within repeats are imported with progressive integer names (v1-v2-v3...) as they would be all assigned the same name otherwise
-* the loop below names variables as [Variablename]+[_number of option]+[_number of repetition]
+* Rearrange variable names and codes to ensure consistency in the dataset
+* In particular, variables within repeats are imported with progressive integer names (v1-v2-v3...) as they would be all assigned the same name otherwise
+* The loop below names variables as [Variablename]+[_number of option]+[_number of repetition]
 
 qui su RepeatPAsstEIG_count
-loc RepeatNum=`r(max)'
+loc RepeatNum = `r(max)'
 
-local num1=1
+local num1 = 1
 foreach var of varlist v* {
-	local `var'_lab: variable label `var'
-	loc `var'_lab=subinstr(`"``var'_lab'"',"/","",.)
-	di `"``var'_lab'"'
-	cap rename `var' ``var'_lab'
-	if _rc==110 {
-		cap rename ``var'_lab' ``var'_lab'_1
-		cap rename `var' ``var'_lab'_`num1'
-		if _rc==110 {
-			local num1=`num1'+1
-			cap rename `var' ``var'_lab'_`num1'
-		}
-	}
+    local `var'_lab : variable label `var'
+    loc `var'_lab = subinstr(`"``var'_lab'"', "/", "", .)
+    di `"``var'_lab'"'
+    cap rename `var' ``var'_lab'
+    if _rc == 110 {
+        cap rename ``var'_lab' ``var'_lab'_1
+        cap rename `var' ``var'_lab'_`num1'
+        if _rc == 110 {
+            local num1 = `num1' + 1
+            cap rename `var' ``var'_lab'_`num1'
+        }
+    }
 }
 
-assert `num1'==`RepeatNum' // check if all household members have been accounted in the loop
+assert `num1' == `RepeatNum' // check that all household members have been accounted for in the loop
 
 qui foreach var of varlist * {
-	cap destring `var', replace i("n/a") // destring variables that have "n/a", which will be replaced with "." (as per Stata convention)
+    cap destring `var', replace i("n/a") // destring variables that have "n/a", which will be replaced with "." (as per Stata convention)
 }
 
-// repeat for the number of HH members participating.
-forval i=1(1)`RepeatNum' {
-	local PTrainingPart
-	// loop is set to account for up to 9 training types, with 0 as no training.
-	forval j=1(1)9 {
-		cap confirm var PTrainingTypes`j'_`i'
-		if _rc==0 {
-			local PTrainingPart `PTrainingPart' PTrainingTypes`j'_`i'
-		}
-	}
-	
-	egen PostTrainingEngagement_`i'=rowmax(PPostTrainingEmpl_`i' PPostTrainingIncome_`i') // if individual was either employed or started a self-employment
-	local PostTrainingEngagement `PostTrainingEngagement' PostTrainingEngagement_`i' 
-	
-	egen PTrainingPart_`i'=rowmax(`PTrainingPart') // if individual participated at least a training activity in the list
-	local PTrainingPartNb `PTrainingPartNb' PTrainingPart_`i'
-	
+* Build the per-member indicators, once for each repeat (1 to RepeatNum)
+forval i = 1(1)`RepeatNum' {
+    local PTrainingPart
+    * Collect whichever of the up to 9 training-type variables exist for this repeat (0 = no training)
+    forval j = 1(1)9 {
+        cap confirm var PTrainingTypes`j'_`i'
+        if _rc == 0 {
+            local PTrainingPart `PTrainingPart' PTrainingTypes`j'_`i'
+        }
+    }
+    
+    egen PostTrainingEngagement_`i' = rowmax(PPostTrainingEmpl_`i' PPostTrainingIncome_`i') // if individual was either employed or started self-employment
+    local PostTrainingEngagement `PostTrainingEngagement' PostTrainingEngagement_`i' 
+    
+    egen PTrainingPart_`i' = rowmax(`PTrainingPart') // if individual participated in at least one training activity in the list
+    local PTrainingPartNb `PTrainingPartNb' PTrainingPart_`i'
+}
-// variables (counts and shares) are still at household level
-egen PostTrainingEngagement=rowtotal(`PostTrainingEngagement')
+
+* Variables (counts and shares) are still at household level
+egen PostTrainingEngagement = rowtotal(`PostTrainingEngagement')
 label var PostTrainingEngagement "Number of training participants engaging in income generating activities (self-employed or salaried)"
-egen PTrainingPartNb=rowtotal(`PTrainingPartNb')
+egen PTrainingPartNb = rowtotal(`PTrainingPartNb')
 label var PTrainingPartNb "Number of training participants"
 
-gen EIG=PostTrainingEngagement/PTrainingPartNb
+gen EIG = PostTrainingEngagement / PTrainingPartNb
 label var EIG "Share of training participants who were able to engage in income generating activities post-training"
 cap drop `PTrainingPartNb' `PostTrainingEngagement'
-// example of summary statistic for full sample, more analysis code is provided in the dedicated repository
-su EIG
 
+* Example of summary statistic for full sample, more analysis code is provided in the dedicated repository
+sum EIG
 
-/* END OF DO-FILE */
\ No newline at end of file
+* End of Scripts
\ No newline at end of file
diff --git a/Indicators/Engagement-in-income-generating-activities/EIG-indicator.py b/Indicators/Engagement-in-income-generating-activities/EIG-indicator.py
new file mode 100644
index 0000000..a2ff214
--- /dev/null
+++ b/Indicators/Engagement-in-income-generating-activities/EIG-indicator.py
@@ -0,0 +1,39 @@
+#------------------------------------------------------------------------------
+#                          WFP Standardized Scripts
+#    Engagement in Income Generation Activities (EIG) Calculation
+#------------------------------------------------------------------------------
+
+# This script calculates the Engagement in Income Generation Activities (EIG)
+# using standard variable names and sample data.
+# Detailed guidelines can be found in the WFP documentation.
+
+import pandas as pd
+
+# Add sample data (pandas reads "n/a" cells as missing by default)
+data = pd.read_csv("~/GitHub/RAMResourcesScripts/Static/EIG_Sample_Survey.csv")
+
+# Rearrange variable names to ensure consistency in the dataset
+data.columns = [col.replace("/", "") for col in data.columns]
+
+# Up to 9 training types: keep the ones present, coerce all inputs to numeric
+training_cols = [c for c in (f'PTrainingTypes{i}' for i in range(1, 10)) if c in data.columns]
+outcome_cols = ['PPostTrainingEmpl', 'PPostTrainingIncome']
+for col in training_cols + outcome_cols:
+    data[col] = pd.to_numeric(data[col], errors='coerce')
+
+# Member flags: engaged post-training / took part in at least one training
+data['PostTrainingEngagement'] = data[outcome_cols].max(axis=1)
+data['PTrainingPart'] = data[training_cols].max(axis=1)
+
+# Household level counts; EIG is missing when no member was trained
+household_data = data.groupby('household_id').agg(
+    PostTrainingEngagement=('PostTrainingEngagement', 'sum'),
+    PTrainingPartNb=('PTrainingPart', 'sum')
+).reset_index()
+
+household_data['EIG'] = household_data['PostTrainingEngagement'] / household_data['PTrainingPartNb'].where(household_data['PTrainingPartNb'] > 0)
+
+# Summary statistics for full sample
+print(household_data['EIG'].describe())
+
+# End of Scripts
\ No newline at end of file
diff --git a/Indicators/Engagement-in-income-generating-activities/EIG-indicator.sps b/Indicators/Engagement-in-income-generating-activities/EIG-indicator.sps
new file mode 100644
index 0000000..6c76dc5
--- /dev/null
+++ b/Indicators/Engagement-in-income-generating-activities/EIG-indicator.sps
@@ -0,0 +1,81 @@
+*------------------------------------------------------------------------------
+*                          WFP Standardized Scripts
+*    Engagement in Income Generation Activities (EIG) Calculation
+*------------------------------------------------------------------------------
+
+* This script calculates the Engagement in Income Generation Activities (EIG)
+* using standard variable names and sample data.
+* Detailed guidelines can be found in the WFP documentation.
+
+* Import dataset (this comment ends with a period so that PRESERVE below is read as its own command).
+PRESERVE.
+SET DECIMAL DOT.
+
+GET DATA /TYPE=TXT
+  /FILE="C:\Users\b\Desktop\demo\EIG_Sample_Survey.csv"
+  /ENCODING='UTF8'
+  /DELCASE=LINE
+  /DELIMITERS=","
+  /ARRANGEMENT=DELIMITED
+  /FIRSTCASE=2
+  /VARIABLES=
+  v1 AUTO
+  v2 AUTO
+  v3 AUTO
+  /* Add additional variables as needed */
+  /MAP.
+RESTORE.
+
+CACHE.
+EXECUTE.
+DATASET NAME DataSet1 WINDOW=FRONT.
+
+* Rearrange variable names and codes to ensure consistency in the dataset.
+* In particular, variables within repeats are imported with progressive integer names (v1, v2, v3, ...).
+* NOTE(review): the loop below is written to rename v1-v9 to Variable_<index>, not to [VariableName]+[_number of option]+[_number of repetition] - confirm the intended names.
+
+* Display the maximum of v1 in the output; no later command in this script reads that value.
+FREQUENCIES VARIABLES=v1 /FORMAT=NOTABLE /STATISTICS=MAXIMUM.
+
+* Rename variables to ensure consistency.
+* NOTE(review): !index looks like macro syntax - confirm that DO REPEAT substitutes it and that RENAME VARIABLES is accepted inside DO REPEAT.
+DO REPEAT oldvar=v1 TO v9 /index=1 TO 9.
+  RENAME VARIABLES (oldvar = Variable_!index).
+END REPEAT.
+
+* Convert "n/a" to missing values. NOTE(review): 'n/a' is a string value recoded to SYSMIS in place - confirm this is valid for the imported variable types.
+DO REPEAT var=Variable_1 TO Variable_9.
+  RECODE var ('n/a' = SYSMIS) INTO var. 
+  EXECUTE.
+END REPEAT.
+
+* Calculate indicators for each repeat (1 to 9). NOTE(review): the PTrainingTypes and PPostTraining inputs are not created by the Variable_1 to Variable_9 renaming in this script - confirm their source.
+DO REPEAT i=1 TO 9.
+  * Participation flag: maximum across the nine training-type variables of this repeat.
+  COMPUTE PTrainingPart_!i = MAX(PTrainingTypes1_!i, PTrainingTypes2_!i, PTrainingTypes3_!i, PTrainingTypes4_!i, PTrainingTypes5_!i, PTrainingTypes6_!i, PTrainingTypes7_!i, PTrainingTypes8_!i, PTrainingTypes9_!i).
+  * Engagement flag: maximum of the post-training employment and income variables of this repeat.
+  COMPUTE PostTrainingEngagement_!i = MAX(PPostTrainingEmpl_!i, PPostTrainingIncome_!i).
+  EXECUTE.
+END REPEAT.
+
+* Total the per-repeat indicators; /BREAK names no grouping variable. NOTE(review): the Stata script sums across each row's repeat variables (egen rowtotal) - confirm AGGREGATE gives the same result.
+AGGREGATE
+  /OUTFILE=* MODE=ADDVARIABLES
+  /BREAK=
+  /PostTrainingEngagement = SUM(PostTrainingEngagement_1 TO PostTrainingEngagement_9)
+  /PTrainingPartNb = SUM(PTrainingPart_1 TO PTrainingPart_9).
+
+* Calculate the EIG indicator: engaged participants divided by the number of training participants.
+COMPUTE EIG = PostTrainingEngagement / PTrainingPartNb.
+VARIABLE LABELS EIG "Share of training participants who were able to engage in income generating activities post-training".
+EXECUTE.
+
+* Drop the per-repeat helper variables.
+DELETE VARIABLES PostTrainingEngagement_1 TO PostTrainingEngagement_9 PTrainingPart_1 TO PTrainingPart_9.
+EXECUTE.
+
+* Example of summary statistics for the full sample (mean of EIG).
+FREQUENCIES VARIABLES=EIG /STATISTICS=MEAN.
+EXECUTE.
+
+* End of Scripts.
\ No newline at end of file