Machine learning as proxy
Heterogeneity in CATE for Birthweight
## Function for generic machine learning of Chernozhukov, Demirer, Duflo, & Fernandez-Val (2018)
genericML <- function(x, y, treat, fit.function, predict.function,
                      n.split=10, n.group=5, clusterid=NULL) {
  require(sandwich)  # needed for vcovHC even when clusterid is NULL
  blp <- matrix(NA, nrow=n.split, ncol=2)
  blp.se <- blp
  gate <- matrix(NA, nrow=n.split, ncol=n.group)
  gate.se <- gate
  baseline <- matrix(NA, nrow=nrow(x), ncol=n.split)
  cate <- matrix(NA, nrow=nrow(x), ncol=n.split)
  Lambda <- matrix(NA, nrow=n.split, ncol=2)
  for (i in 1:n.split) {
    ## random split into auxiliary (!main) and main samples
    main <- runif(nrow(x)) > 0.5
    ## fit proxies on the auxiliary sample, separately by treatment status
    fit1 <- fit.function(x[!main & treat==1, ], y[!main & treat==1])
    fit0 <- fit.function(x[!main & treat==0, ], y[!main & treat==0])
    B <- as.vector(predict.function(fit0, x))       # proxy for E[y(0)|x]
    S <- as.vector(predict.function(fit1, x)) - B   # proxy for the CATE
    baseline[, i] <- B
    cate[, i] <- S
    ES <- mean(S)
    ## BLP: assume P(treat|x) = P(treat) = mean(treat), so the
    ## (P(x)(1-P(x)))^{-1} weights are constant and can be dropped
    p <- mean(treat)
    df <- data.frame(y, B, treat, S, main)
    reg <- lm(y ~ B + I(treat-p) + I((treat-p)*(S-ES)), data=subset(df, main))
    blp[i, ] <- reg$coefficients[3:4]
    if (is.null(clusterid)) blp.se[i, ] <- sqrt(diag(vcovHC(reg))[3:4])
    else blp.se[i, ] <- sqrt(diag(vcovCL(reg, clusterid[main]))[3:4])
    Lambda[i, 1] <- reg$coefficients[4]^2*var(S)    # Lambda = beta_1^2 Var(S)
    ## GATES: group indicators from quantiles of S
    cut <- quantile(S, probs=seq(0, 1, length.out=(n.group+1)))
    cut[n.group+1] <- cut[n.group+1] + 1            # ensure max(S) lands in the top group
    for (k in 1:n.group) {
      df[, sprintf("G.%d", k)] <- (cut[k] <= S & S < cut[k+1])
    }
    greg <- lm(as.formula(paste(c("y ~ B", sprintf("I((treat-p)*G.%d)", 1:n.group)),
                                collapse=" + ")),
               data=subset(df, main))
    gc <- grep("G", names(greg$coefficients))
    gate[i, ] <- greg$coefficients[gc]
    if (is.null(clusterid)) gate.se[i, ] <- sqrt(diag(vcovHC(greg))[gc])
    else gate.se[i, ] <- sqrt(diag(vcovCL(greg, clusterid[main]))[gc])
    Lambda[i, 2] <- sum(gate[i, ]^2)/n.group
  }
  return(list(gate=gate, gate.se=gate.se, blp=blp, blp.se=blp.se,
              Lambda=Lambda, baseline=baseline, cate=cate))
}
## Summarize across splits by taking medians, as Chernozhukov et al. (2018)
## recommend for inference that is robust to the randomness of sample splitting
genericML.summary <- function(gml) {
  blp <- apply(gml$blp, 2, median, na.rm=TRUE)
  blp.se <- apply(gml$blp.se, 2, median, na.rm=TRUE)
  gate <- apply(gml$gate, 2, median, na.rm=TRUE)
  gate.se <- apply(gml$gate.se, 2, median, na.rm=TRUE)
  Lambda <- apply(gml$Lambda, 2, median, na.rm=TRUE)
  return(list(blp=blp, blp.se=blp.se, gate=gate, gate.se=gate.se, Lambda=Lambda))
}
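Before taking these functions to the data, here is a minimal sketch (not part of the original analysis) of the interface `genericML` expects: `fit.function(x, y)` returns any fitted object, and `predict.function(model, x)` maps it to a vector of predictions. With simulated data and OLS as the learner:

```r
# toy data with a known heterogeneous effect in x[,1]
set.seed(42)
n <- 500
x <- matrix(rnorm(n*5), n, 5)
treat <- rbinom(n, 1, 0.5)
y <- x[,1]*treat + x[,2] + rnorm(n)
gml.ols <- genericML(x, y, treat,
                     fit.function=function(x, y) lm.fit(cbind(1, x), y),
                     predict.function=function(model, x) cbind(1, x) %*% model$coefficients,
                     n.split=5, n.group=5)
genericML.summary(gml.ols)$blp  # b1 near 1 indicates the proxy tracks the true CATE
```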
library(glmnet)
# create x matrix via lm, which conveniently expands factors and interactions
fmla.l <- bw ~ pkh_kec_ever +
  as.factor(edu)*as.factor(agecat) + log_xp_percap + hh_land + hh_home + as.factor(dist) +
  hh_phone + hh_rf_tile + hh_rf_shingle + hh_rf_fiber +
  hh_wall_plaster + hh_wall_brick + hh_wall_wood + hh_wall_fiber +
  hh_fl_tile + hh_fl_plaster + hh_fl_wood + hh_fl_dirt +
  hh_water_pam + hh_water_mechwell + hh_water_well + hh_water_spring + hh_water_river +
  hh_waterhome +
  hh_toilet_own + hh_toilet_pub + hh_toilet_none +
  hh_waste_tank + hh_waste_hole + hh_waste_river + hh_waste_field +
  hh_kitchen +
  hh_cook_wood + hh_cook_kerosene + hh_cook_gas +
  tv + fridge + motorbike + car + goat + cow + horse
m <- lm(fmla.l, data=pw, x=TRUE, y=TRUE)
treat <- m$x[, 2]               # pkh_kec_ever is the first regressor after the intercept
Xl <- m$x[, 3:ncol(m$x)]
# standardize the outcome for fitting; predictions are rescaled inside predict.function
scale <- sd(m$y)
center <- mean(m$y)
yl <- (m$y - center)/scale
lid <- as.factor(pw[as.numeric(rownames(m$x)), ]$Location_ID)  # cluster identifier
gml.lasso <- genericML(Xl, m$y, treat,
                       function(x, y) cv.glmnet(x, (y-center)/scale, alpha=1, parallel=FALSE,
                                                intercept=TRUE, nfolds=20),
                       function(model, x) predict(model, x, s=model$lambda.min,
                                                  type="response")*scale + center,
                       n.split=11, n.group=5, clusterid=lid)
library(grf)
gml.rf <- genericML(Xl, m$y, treat,
                    function(x, y) regression_forest(x, (y-center)/scale, tune.parameters=TRUE),
                    function(model, x) predict(model, x)$predictions*scale + center,
                    n.split=11, n.group=5, clusterid=lid)
library(RSNNS)
gml.nn <- genericML(Xl, m$y, treat,
                    function(x, y) mlp(x, (y-center)/scale, linOut=TRUE, size=c(10,10),
                                       learnFunc="SCG"),
                    function(model, x) predict(model, x)*scale + center,
                    n.split=11, n.group=5, clusterid=lid)
library(GGally)
df <- data.frame(Lasso=apply(gml.lasso$cate, 1, median),
                 Forest=apply(gml.rf$cate, 1, median),
                 Neural=apply(gml.nn$cate, 1, median))
ggpairs(df, lower=list(continuous="smooth")) + theme_minimal()
[Figure: pairwise scatter plots of the median CATE proxies (Lasso, Forest, Neural) for birthweight]
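As a numerical companion to the pairs plot (an addition here, reusing the `df` just constructed), the agreement among the three proxies can be read off directly:

```r
round(cor(df), 2)                      # Pearson correlations of the median CATE proxies
round(cor(df, method="spearman"), 2)   # rank correlations
```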
Best linear projection of CATE

Randomly partition the sample into auxiliary and main samples.
Use any method on the auxiliary sample to estimate the proxies \[S(x) = \widehat{\Er[y(1) - y(0) | x]}\] and \[B(x) = \widehat{\Er[y(0)|x]}\]
Use the main sample to regress, with weights \((P(x)(1-P(x)))^{-1}\), \[
y = \alpha_0 + \alpha_1 B(x) + \beta_0 (d-P(x)) + \beta_1
(d-P(x))(S(x) - \Er[S(x)]) + \epsilon
\]
Then \(\hat{\beta}_0, \hat{\beta}_1 \to_p \argmin_{b_0,b_1} \Er[(s_0(x) - b_0 - b_1 (S(x)-\Er[S(x)]))^2]\), where \(s_0(x) = \Er[y(1)-y(0)|x]\) is the true CATE.
\(\Lambda = \beta_1^2 Var(S(x)) = corr(s_0(x),S(x))^2 Var(s_0(x))\) measures how much of the heterogeneity in the true CATE the proxy captures.
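Since \(\beta_1\) is the population least-squares slope from projecting \(s_0(x)\) on \(S(x)\), one line of algebra (added here to connect the two expressions for \(\Lambda\)) gives \[
\beta_1 = \frac{Cov(s_0(x), S(x))}{Var(S(x))}, \qquad
\Lambda = \beta_1^2 Var(S(x)) = \frac{Cov(s_0(x), S(x))^2}{Var(S(x))} = corr(s_0(x), S(x))^2 Var(s_0(x)).
\] Thus \(\Lambda = Var(s_0(x))\) for a perfect proxy and \(\Lambda = 0\) for a proxy uncorrelated with the true CATE, so comparing \(\Lambda\) across learners favors the one whose proxy captures the most heterogeneity.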
library(kableExtra)
colfn <- function(gml) {
  s <- genericML.summary(gml)
  c(s$blp[1], s$blp.se[1], s$blp[2], s$blp.se[2], s$Lambda[1])
}
tbl <- cbind(colfn(gml.lasso), colfn(gml.rf), colfn(gml.nn))
colnames(tbl) <- c("Lasso", "Regression forest", "Neural network")
rownames(tbl) <- c("ATE=b0", "se", "b1", "se", "Lambda")
kable_styling(kable(tbl, caption="Machine learning proxies as BLP of CATE on Birthweight",
                    format="html", digits=3),
              bootstrap_options=c("striped", "hover", "condensed", "responsive"),
              full_width=TRUE)
Machine learning proxies as BLP of CATE on Birthweight

|        |   Lasso | Regression forest | Neural network |
|:-------|--------:|------------------:|---------------:|
| ATE=b0 |  -3.325 |           -12.390 |         -9.003 |
| se     |  33.095 |            32.024 |         31.416 |
| b1     |   0.169 |             0.435 |          0.114 |
| se     |   0.480 |             0.653 |          0.118 |
| Lambda | 562.187 |           574.082 |        403.610 |
Group average treatment effects

Define \(G_k = 1\{\ell_{k-1} \leq S(x) < \ell_k\}\) with \(\ell_k\) the \(k/5\) quantile of \(S(x)\); a small numerical check of this construction appears after the formulas below.
Use the main sample to regress, with weights \((P(x)(1-P(x)))^{-1}\), \[
y = \alpha_0 + \alpha_1 B(x) + \sum_k \gamma_k (d-P(x)) 1(G_k) +
\epsilon
\]
Then \(\hat{\gamma}_k \to_p \Er[y(1) - y(0) | G_k]\), and \(\bar{\Lambda} = \frac{1}{K} \sum_k \gamma_k^2\) summarizes the heterogeneity across groups.
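As a quick sanity check (an addition, using simulated values of \(S\) rather than the data), the quantile-cut construction used in `genericML` assigns roughly equal numbers of observations to each group; note the small shift of the top cutpoint so that the maximum falls inside the last group:

```r
set.seed(1)
S <- rnorm(100)                                    # stand-in for a CATE proxy
cut <- quantile(S, probs=seq(0, 1, length.out=6))  # quintile cutpoints
cut[6] <- cut[6] + 1                               # include max(S) in the top group
G <- sapply(1:5, function(k) cut[k] <= S & S < cut[k+1])
colSums(G)                                         # about 20 observations per group
```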
library(kableExtra)
colfn <- function(gml) {
  s <- genericML.summary(gml)
  c(s$gate[1], s$gate.se[1],
    s$gate[2], s$gate.se[2],
    s$gate[3], s$gate.se[3],
    s$gate[4], s$gate.se[4],
    s$gate[5], s$gate.se[5],
    s$Lambda[2])
}
tbl <- cbind(colfn(gml.lasso), colfn(gml.rf), colfn(gml.nn))
colnames(tbl) <- c("Lasso", "Regression forest", "Neural network")
rownames(tbl) <- c(as.vector(sapply(1:5, function(x) c(sprintf("GATE %d", x), "se"))), "Lambda")
kable_styling(kable(tbl, caption="GATE on Birthweight", format="html", digits=3),
              bootstrap_options=c("striped", "hover", "condensed", "responsive"),
              full_width=TRUE)
GATE on Birthweight

|        |    Lasso | Regression forest | Neural network |
|:-------|---------:|------------------:|---------------:|
| GATE 1 |  -16.413 |           -28.554 |        -14.990 |
| se     |   71.836 |            72.023 |         62.556 |
| GATE 2 |   44.850 |           -39.772 |        -30.630 |
| se     |   67.828 |            64.132 |         67.542 |
| GATE 3 |  -41.362 |            -1.857 |         -5.024 |
| se     |   71.415 |            63.443 |         68.606 |
| GATE 4 |  -25.154 |           -28.400 |          0.072 |
| se     |   69.296 |            65.399 |         70.607 |
| GATE 5 |   12.703 |            54.117 |         84.213 |
| se     |   72.644 |            76.440 |         71.837 |
| Lambda | 2850.894 |          6524.788 |       3937.901 |
Heterogeneity in CATE on Midwife utilization

# outcome: indicator for a midwife-assisted birth (binary, so no centering/scaling)
mwb <- pw[as.numeric(rownames(m$x)), ]$midwife_birth
mw.lasso <- genericML(Xl, mwb, treat,
                      function(x, y) cv.glmnet(x, y, family="binomial", alpha=1,
                                               parallel=FALSE, intercept=TRUE, nfolds=20),
                      function(model, x) predict(model, x, s=model$lambda.min, type="response"),
                      n.split=11, n.group=5, clusterid=lid)
library(grf)
mw.rf <- genericML(Xl, mwb, treat,
                   function(x, y) regression_forest(x, y, tune.parameters=TRUE),
                   function(model, x) predict(model, x)$predictions,
                   n.split=11, n.group=5, clusterid=lid)
library(RSNNS)
mw.nn <- genericML(Xl, mwb, treat,
                   function(x, y) mlp(x, y, linOut=FALSE, size=c(10,10), learnFunc="SCG"),
                   function(model, x) predict(model, x),
                   n.split=11, n.group=5, clusterid=lid)
library(GGally)
df <- data.frame(Lasso=apply(mw.lasso$cate, 1, median),
                 Forest=apply(mw.rf$cate, 1, median),
                 Neural=apply(mw.nn$cate, 1, median))
ggpairs(df, lower=list(continuous="smooth")) + theme_minimal()
[Figure: pairwise scatter plots of the median CATE proxies (Lasso, Forest, Neural) for midwife use]
library(kableExtra)
colfn <- function(gml) {
  s <- genericML.summary(gml)
  c(s$blp[1], s$blp.se[1], s$blp[2], s$blp.se[2], s$Lambda[1])
}
tbl <- cbind(colfn(mw.lasso), colfn(mw.rf), colfn(mw.nn))
colnames(tbl) <- c("Lasso", "Regression forest", "Neural network")
rownames(tbl) <- c("ATE=b0", "se", "b1", "se", "Lambda")
kable_styling(kable(tbl, caption="Machine learning proxies as BLP of CATE on Midwife Use",
                    format="html", digits=3),
              bootstrap_options=c("striped", "hover", "condensed", "responsive"),
              full_width=TRUE)
Machine learning proxies as BLP of CATE on Midwife Use

|        | Lasso | Regression forest | Neural network |
|:-------|------:|------------------:|---------------:|
| ATE=b0 | 0.057 |             0.058 |          0.056 |
| se     | 0.024 |             0.024 |          0.025 |
| b1     | 0.368 |             0.606 |          0.150 |
| se     | 0.167 |             0.263 |          0.088 |
| Lambda | 0.003 |             0.003 |          0.002 |
library(kableExtra)
colfn <- function(gml) {
  s <- genericML.summary(gml)
  c(s$gate[1], s$gate.se[1],
    s$gate[2], s$gate.se[2],
    s$gate[3], s$gate.se[3],
    s$gate[4], s$gate.se[4],
    s$gate[5], s$gate.se[5],
    s$Lambda[2])
}
tbl <- cbind(colfn(mw.lasso), colfn(mw.rf), colfn(mw.nn))
colnames(tbl) <- c("Lasso", "Regression forest", "Neural network")
rownames(tbl) <- c(as.vector(sapply(1:5, function(x) c(sprintf("GATE %d", x), "se"))), "Lambda")
kable_styling(kable(tbl, caption="GATE on Midwife Use", format="html", digits=3),
              bootstrap_options=c("striped", "hover", "condensed", "responsive"),
              full_width=TRUE)
GATE on Midwife Use

|        |  Lasso | Regression forest | Neural network |
|:-------|-------:|------------------:|---------------:|
| GATE 1 |  0.012 |             0.000 |          0.036 |
| se     |  0.049 |             0.054 |          0.057 |
| GATE 2 | -0.023 |            -0.024 |          0.030 |
| se     |  0.052 |             0.051 |          0.058 |
| GATE 3 |  0.022 |             0.021 |          0.044 |
| se     |  0.052 |             0.050 |          0.047 |
| GATE 4 |  0.056 |             0.058 |          0.034 |
| se     |  0.056 |             0.050 |          0.053 |
| GATE 5 |  0.169 |             0.178 |          0.137 |
| se     |  0.053 |             0.053 |          0.052 |
| Lambda |  0.008 |             0.009 |          0.009 |
Covariate means by group

df <- pw[as.numeric(rownames(m$x)), ]
df$edu99 <- df$edu==99
df$educ <- df$edu
df$educ[df$educ==99] <- NA   # education coded 99 indicates missing
vars <- c("log_xp_percap", "agecat", "educ", "tv", "goat",
          "hh_toilet_own", "motorbike", "hh_cook_wood", "pkh_ever")
tmp <- data.frame()
## covariate means by CATE group: within each split, regress the covariate on
## group dummies (no intercept) to get group means with (clustered) standard
## errors, then take medians across splits
groupMeans <- function(var, gml, clusterid) {
  n.group <- ncol(gml$gate)
  gate <- matrix(NA, nrow=nrow(gml$gate), ncol=ncol(gml$gate))
  gate.se <- gate
  dat <- data.frame(y=var)
  for (i in 1:ncol(gml$cate)) {
    S <- gml$cate[, i]
    cut <- quantile(S, probs=seq(0, 1, length.out=(n.group+1)))
    cut[n.group+1] <- cut[n.group+1] + 1
    for (k in 1:n.group) {
      dat[, sprintf("G.%d", k)] <- 1*(cut[k] <= S & S < cut[k+1])
    }
    greg <- lm(as.formula(paste(c("y ~ -1", sprintf("G.%d", 1:n.group)), collapse=" + ")),
               data=dat)
    gc <- grep("G", names(greg$coefficients))
    gate[i, ] <- greg$coefficients[gc]
    if (is.null(clusterid)) gate.se[i, ] <- sqrt(diag(vcovHC(greg))[gc])
    else gate.se[i, ] <- sqrt(diag(vcovCL(greg, clusterid))[gc])
  }
  return(list(mean=apply(gate, 2, median, na.rm=TRUE),
              se=apply(gate.se, 2, median, na.rm=TRUE)))
}
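This is essentially the classification analysis (CLAN) step of Chernozhukov et al. (2018). A common follow-up, sketched here as an addition (not part of the code above), compares the most and least affected groups; the standard error below treats the two group means as independent, which is exact for OLS on disjoint group dummies but only approximate once clusters span groups:

```r
# hypothetical follow-up: most-affected minus least-affected group mean
gm <- groupMeans(df$log_xp_percap, mw.rf, lid)
c(difference = gm$mean[5] - gm$mean[1],
  se = sqrt(gm$se[5]^2 + gm$se[1]^2))
```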
methods <- c("Lasso", "Forest", "Neural")
gmls <- list(mw.lasso, mw.rf, mw.nn)
for (v in vars) {
  for (j in 1:length(methods)) {   # loop index renamed to avoid clobbering the lm fit `m`
    gm <- groupMeans(df[, v], gmls[[j]], lid)
    tmp <- rbind(tmp, data.frame(group=1:length(gm$mean), variable=v, method=methods[j],
                                 mean=gm$mean, se=gm$se))
  }
}
library(ggplot2)
fig <- ggplot(data=tmp, aes(x=group, y=mean, colour=method)) +
  geom_line() +
  geom_line(aes(y=mean + 1.96*se), linetype=2) +   # 95% confidence bands
  geom_line(aes(y=mean - 1.96*se), linetype=2) +
  facet_wrap(~ variable, scales="free_y") + theme_minimal()
print(fig)
[Figure: covariate means with 95% confidence bands across CATE groups, by method and variable]