\documentclass[a4paper,10pt]{article}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{setspace}
\usepackage{harvard}
\usepackage{aer}
\usepackage{fullpage}
\usepackage{graphicx}
\newcommand{\indep}{\perp\!\!\!\perp}
\newcommand{\argmax}{\operatornamewithlimits{arg\,max}}
\newcommand{\argmin}{\operatornamewithlimits{arg\,min}}
\newcommand{\plim}{\operatornamewithlimits{plim}}
\newcommand{\citefull}[1]{\citename{#1} \citeyear{#1}}
\newcommand{\citeparagraph}[1]{\medskip\noindent\textbf{{\citename{#1} \citeyear{#1}}}}
\newcommand{\cov}{\text{Cov}}
\newcommand{\var}{\text{Var}}
\newcommand{\rank}{\text{rank}}
%\newcommand{\det}{\text{det}}
\def\inprobLOW{\rightarrow_p}
\def\inprobHIGH{\,{\buildrel p \over \rightarrow}\,}
\def\as{\,{\buildrel a.s. \over \rightarrow}\,}
\def\asu{\,{\buildrel a.s.u. \over \rightarrow}\,}
\def\inprob{\,{\inprobHIGH}\,}
\def\indist{\,{\buildrel d \over \rightarrow}\,}
% defined environments
\newtheorem{thm}{Theorem} %[section]
\newtheorem{cor}[thm]{Corollary}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\theoremstyle{remark}
\newtheorem{rem}[thm]{Remark}
\newtheorem{ex}[thm]{Example}
\theoremstyle{definition}
\newtheorem{defn}[thm]{Definition}
\title{14.385 Recitation 1}
\author{Paul Schrimpf}
\begin{document}
\maketitle
\paragraph{Types of Estimators}
\begin{itemize}
\item Moment methods
\begin{itemize}
\item GMM
\item IV
\end{itemize}
\item Extremum methods
\begin{itemize}
\item MLE
\item M-estimators
\item Quantile regression
\item \textbf{Minimum Distance}
\end{itemize}
\end{itemize}
\begin{figure}[hbpt]
\centering
\caption{Relationship among estimators}
\begin{minipage}{\linewidth}
\centering
\includegraphics[width=\textwidth]{neweyMcFaddenFig1}
\footnote{Source: Newey and McFadden (1994)}
\end{minipage}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Minimum Distance}
\begin{align}
\hat{\theta} = \argmin_{\theta \in \Theta} \hat{f}_n(\theta)' \hat{W} \hat{f}_n(\theta)
\end{align}
where $\plim \hat{f}_n(\theta_0) = 0$.
\paragraph{Includes:}
\begin{itemize}
\item GMM with $\hat{f}_n(\theta) = \frac{1}{n} \sum
g(z_i,\theta)$ and $\hat{W} = \hat{A}$
\item MLE with $\hat{f}_n(\theta) = \frac{1}{n} \sum \frac{\partial
\ln f(z_i|\theta)}{\partial \theta}$ and $\hat{W} = I$
\item Classical Minimum Distance (CMD): $\hat{f}_n(\theta) = \hat{\pi}
- h(\theta)$ where $\hat{\pi} \inprob \pi_0 = h(\theta_0)$.
Usually, $\pi$ are reduced form parameters, $\theta$ are structural
parameters, and $h(\theta)$ is a mapping from the structural
parameters to the reduced form.
\begin{itemize}
\item \emph{Example:} Chamberlain (1982, HoE 1984) approach to panel data.
Model:
\begin{align*}
y_{it} = x_{it}\beta + c_i + e_{it} \text{ , }
E[e_{it}|x_{i},c_i] = 0
\end{align*}
Reduced form: regress $y_{it}$ on all $x_{i\cdot}$ to get
$\pi_{t}$. \\
$h(\theta)$: we know that $x_{i\cdot}\pi_t$ is the best linear
predictor of $y_{it}$ given $x_{i\cdot}$. We also know that
\begin{align*}
BLP(y_{it}|x_{i\cdot}) = & BLP(x_{it}\beta + e_{it} |x_{i\cdot}) +
BLP(c_i|x_{i\cdot}) \\
= & x_{it} \beta + x_{i\cdot} \lambda
\end{align*}
So if we stack the $\pi_t$ into a $T \times Tk$ matrix $\pi$, we know
that
\begin{align*}
\pi = h(\beta,\lambda) = I_T \otimes \beta' + \imath_T \lambda'
\end{align*}
where $\beta$ is $k \times 1$ and $\lambda$ is $Tk \times 1$.
\end{itemize}
\item Indirect Inference: mathematically the same as CMD,
$\hat{f}_n(\theta) = \hat{\pi} - h(\theta)$ where $\hat{\pi} \inprob
\pi_0 = h(\theta_0)$, but the justification is slightly different. We
have an economic model, which we are not entirely certain is the
true DGP (or perhaps is just difficult to compute the likelihood
for), but we do believe can capture some important features of
the data. These features of the data are summarized by the
  parameters of an easy to estimate auxiliary model, $\pi$.
  $h(\theta)$ gives the estimates of the auxiliary model that we would
expect if our economic model were the true DGP and had parameters
$\theta$. $h(\theta)$ is often calculated through simulation.
\begin{itemize}
\item \emph{Example: DSGE} (taken from 14.384 notes, for a real
  application see e.g.\ Del Negro, Schorfheide, Smets, and Wouters
(2007))
Consider a simple RBC:
\begin{align*}
\max & E_0 \sum \omega^t \frac{c_t^{1-\gamma}-1}{1-\gamma} \\
\text{s.t. } & c_t + i_t = A \lambda_t k_t^\alpha \\
& k_{t+1} = (1-\delta) k_t + i_t \\
& \lambda_t = \rho \lambda_{t-1} + \epsilon_t, \quad \epsilon_t
\sim N(0,\sigma^2)
\end{align*}
This model has many parameters
($\theta = (\omega,\gamma,A,\alpha,\delta,\rho,\sigma^2)$) and it
would be
difficult to write down a likelihood or moment functions.
Moreover, we don't really believe that this model is the true DGP
and we don't want to use it to explain all aspects of the data.
Instead we just want the model to explain some feature of the
data, say the dynamics as captured by VAR coefficients. Also,
although it is hard to write the likelihood function for this
  model, it is fairly easy to simulate the model. Then we can use
indirect inference as follows:
\begin{enumerate}
\item Estimate (possibly misspecified) VAR from data. A VAR is
simply OLS on:
\begin{align*}
Y_t = \pi_1 Y_{t-1} + \dots + \pi_p Y_{t-p} + u_t
\end{align*}
where $Y_t$ is the vector of observed variables at time $t$. In
this example, $Y_t$ might be $c_t$ and $i_t$.
\item Given $\theta$, simulate the model, estimate the VAR from the
  simulations, and repeat until the objective function is minimized.
\end{enumerate}
\end{itemize}
\end{itemize}
\subsection{Consistency}
Recall the general theorem on consistency from lecture 2:
\begin{thm}
If (i) $Q(\theta)$ is uniquely minimized at the true parameter value
$\theta_0$, (ii) $\Theta$ is compact, (iii) $Q(\cdot)$ is
continuous, and (iv) $\sup_{\theta \in \Theta} |\hat{Q}(\theta) -
Q(\theta)| \inprob 0$, then $\hat{\theta} \inprob \theta_0$.
\end{thm}
We will now discuss applying this theorem to minimum distance. To do
that, we need to verify each of the conditions:
\begin{enumerate}
\item (Identification): suppose $\hat{f}_n(\theta) \inprob f(\theta)$
and $\hat{W} \inprob W$, so that $Q(\theta) = f(\theta)' W
f(\theta)$. As with GMM, showing that this function has a unique
minimum is difficult. A local identification condition is that
  $\rank \frac{\partial f}{\partial \theta} = p$, where $\theta$ is $p
\times 1$. Global identification is typically just assumed.
\item (Compactness): assume it.
\item (Continuity): depends on the particular application. For CMD
and indirect inference, $f(\theta) = \pi - h(\theta)$ is continuous
as long as $h(\theta)$ is continuous. Since $h(\theta)$ does not
depend on the data at all, this condition is easily checked. In the
panel data example, $h(\theta)$ is obviously continuous.
\item (Uniform Convergence): depends on the particular application.
Recall lemma 3 from lecture 2. It was:
\begin{lem}
Suppose $\hat{Q}(\theta) \inprob Q(\theta)$ for each $\theta \in
\Theta$. Then uniform convergence holds if for some $h>0$, we
have uniformly for $\theta,\theta' \in \Theta$
\[ |\hat{Q}(\theta) - \hat{Q}(\theta')| \leq B_T ||\theta -
\theta'||^h \text{ , } B_T = O_p(1) \]
\end{lem}
For CMD and indirect inference,
\begin{align*}
\left| \hat{Q}(\theta) - \hat{Q}(\theta') \right|
= & \left| (\hat{\pi} - h(\theta))'\hat{W} (\hat{\pi} - h(\theta)) -
(\hat{\pi} - h(\theta'))'\hat{W}(\hat{\pi} - h(\theta')) \right| \\
= & \left|2(h(\theta') - h(\theta))' \hat{W} \hat{\pi} +
h(\theta)'\hat{W}h(\theta) - h(\theta')'\hat{W}h(\theta')\right|
\\
\leq & \left|2(h(\theta') - h(\theta))' \hat{W} \hat{\pi}\right| +
\left|h(\theta)'\hat{W}h(\theta) - h(\theta')'\hat{W}h(\theta')\right|
\\
= & \left|2(h(\theta') - h(\theta))' \hat{W} \hat{\pi}\right| +
\left|(h(\theta)-h(\theta'))'\hat{W}(h(\theta) + h(\theta'))\right|
\end{align*}
so a sufficient condition is that
$h(\theta)$ is H\"{o}lder continuous on $\Theta$, i.e.
\[ |h(\theta) - h(\theta')| \leq K ||\theta - \theta'||^h \]
for some $h>0$ and all $\theta,\theta' \in \Theta$. A sufficient
condition for H\"{o}lder continuity is that $h()$ is differentiable
with a bounded derivative because then
\[ |h(\theta) - h(\theta')| \leq \sup_{\Theta} ||h'|| ||\theta -
\theta'|| \]
Clearly, this condition holds for the panel data example. It could
also be checked in other applications.
\begin{itemize}
\item If $h(\theta)$ is computed through simulation, then some
additional steps need to be taken to show consistency. Let
$h_S(\theta)$ denote the value of $h(\theta)$ computed from $S$
simulations. Typically, $h_S(\theta)$ will be some standard
estimator and we will know that $h_S(\theta) \inprob h(\theta)$ as
$S \rightarrow \infty$. For $\hat{Q}$ to converge uniformly, we
  need to require that $S \rightarrow \infty$ as $T \rightarrow
\infty$, and we will need the convergence of $h_S$ to $h$ to be
uniform in addition to the conditions above.
\end{itemize}
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section{Checking Consistency}
When verifying consistency, always start by checking whether
$E\nabla Q(\theta_0) = 0$.

\emph{Example: Probit}\footnote{Taken from Konrad's notes from
  last year.}
\[ Q_n(\beta) = \frac{1}{n} \sum y_i \log(\Phi(x_i\beta)) + (1-y_i)
\log(1-\Phi(x_i\beta)) \]
The derivative is:\footnote{We could just as well calculate $Q =
\plim Q_n$ and then differentiate.}
\begin{align*}
\nabla Q_n(\beta) = & \frac{1}{n} \sum
\left(\frac{y_i}{\Phi(x_i\beta)} - \frac{1-y_i}{1-\Phi(x_i\beta)}
\right) \phi(x_i\beta)x_i \\
= & \frac{1}{n} \sum
\left(\frac{y_i -
\Phi(x_i\beta)}{\Phi(x_i\beta)(1-\Phi(x_i\beta))}\right)
\phi(x_i\beta)x_i
\end{align*}
which has plim
\begin{align*}
\plim \nabla Q_n(\beta) = E\left[
\left(\frac{E[y_i|x_i] -
\Phi(x_i\beta)}{\Phi(x_i\beta)(1-\Phi(x_i\beta))}\right)
\phi(x_i\beta)x_i \right]
\end{align*}
which is $0$ at $\beta_0$.
\end{document}