Lecture 9: Power, Bias, and the AR Test — A Deeper Look at Weak Instruments¶

Summary of Week 8¶

Last week, following Stock & Yogo (2005), we answered the question: how large does the first-stage F-statistic need to be for instruments to be considered "strong"?

Quick recap:

  • Instrument strength is captured by the first-stage F-statistic (regression of $X$ on $Z$). A larger F implies a stronger instrument.
  • Stock & Yogo fix $\rho = 1$ (worst-case endogeneity) and vary population $F$ to study the empirical size of the IV t-test — the actual probability of a false rejection.
  • Rule of Thumb: A first-stage sample $F$ above 10 keeps the worst-case empirical size below ≈ 13.5%. Achieving 5% nominal size requires sample $F$ above 104.

The mapping between population $F$ and sample $F$ (via the noncentral $\chi^2_1$ distribution):

| Population F | Sample F (95th pct) | Worst-case size |
|---:|---:|---:|
| 1.82 | 8.96 | 15% |
| 2.30 | 10.00 | 13.5% |
| 5.78 | 16.38 | 10% |
| 10.00 | 23.10 | 8.6% |
| 29.44 | 50.00 | 6.4% |
| 73.75 | 104.70 | 5.0% |

(Source: Table 1 of Keane & Neal 2024.)
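As a sanity check on this mapping: with a single instrument, the sample first-stage F-statistic follows (approximately) a noncentral $\chi^2_1$ distribution whose noncentrality parameter is the population $F$. A short sketch using Distributions.jl reproduces the "Sample F (95th pct)" column:

```julia
using Distributions

# With one instrument, sample F ~ noncentral χ²₁ with noncentrality
# parameter equal to the population F; its 95th percentile gives the
# "Sample F" column of the table.
pop_F       = [1.82, 2.30, 5.78, 10.00, 29.44, 73.75]
sample_F_95 = [quantile(NoncentralChisq(1, λ), 0.95) for λ in pop_F]
```

The computed quantiles match the tabulated values closely.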

Road Map for Week 9¶

Week 8 focused entirely on size — what happens when $H_0:\beta=0$ is true. This week we extend the analysis:

  1. Standard errors under weak IV: Are the IV standard errors reliable?
  2. The funnel plot: A diagnostic scatter plot of IV estimates against their standard errors that reveals a structural problem with weak instruments.
  3. Power: What is the probability of correctly rejecting a false null?
  4. Power asymmetry: A subtle distortion that makes the IV t-test reject more easily in one direction than the other.
  5. The Anderson–Rubin (AR) test: A simple, robust alternative recommended by Keane & Neal (2024).

Keane and Neal (2024)¶

Keane and Neal point out that the story gets more complicated once we pay attention to the power function.

While it is important to have a test with good size properties, we also want the test to be powerful — able to detect a real effect when one exists.

Recall:

Size is the probability of rejecting the null hypothesis when it is true.

Power is the probability of rejecting the null hypothesis when it is false.

Practical Lessons from Week 8¶

We learned that values for $\rho$ of practical relevance fall between zero and 0.50. For our simulations we therefore focus on:

  • 0.00 (no endogeneity)
  • 0.10
  • 0.30
  • 0.50

Data Generating Process (DGP)¶

We continue with the toy model of Keane & Neal (2024) p. 193, the same as last week:

$$ \begin{align*} Y_i &= \beta X_i + u_i\\ X_i &= \pi Z_i + v_i\\ v_i &= \rho\, u_i + \sqrt{1-\rho^2}\, \eta_i \end{align*} $$

where

  • $u_i \sim N(0,1)$, $\;\eta_i \sim N(0,1)$, $\;Z_i \sim N(0,1)$
  • $\beta = 0$ in the size simulations (the power simulations later vary the true $\beta$ over a grid)
  • $\operatorname{Var}(v_i) = 1$

We set $\pi = \sqrt{F/N}$ to control instrument strength via the population $F$-statistic: with $\operatorname{Var}(Z_i) = \operatorname{Var}(v_i) = 1$, the population first-stage $F$ equals $N\pi^2$.

  • Bias of OLS $\approx \rho$ (provided $\pi$ is small)
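The bias approximation follows in one line from the DGP:

$$ \operatorname{plim}\,\hat{\beta}_{\text{OLS}} - \beta = \frac{\operatorname{Cov}(X_i, u_i)}{\operatorname{Var}(X_i)} = \frac{\rho}{\pi^2 + 1} \approx \rho \quad \text{for small } \pi, $$

since $\operatorname{Cov}(X_i, u_i) = \operatorname{Cov}(v_i, u_i) = \rho$ and $\operatorname{Var}(X_i) = \pi^2 \operatorname{Var}(Z_i) + \operatorname{Var}(v_i) = \pi^2 + 1$.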

Julia Functions¶

The cells below define the simulation machinery for this lecture. The functions dgp_keane_neal, ols_estimator, iv_estimator, and simulate_distribution are identical to those in Week 8. The power_function is new: it computes empirical power across a grid of true $\beta$ values. Feel free to skim them now and refer back as needed.

In [1]:
using Distributions, Random, Statistics
using Plots, LaTeXStrings
using Plots.PlotMeasures: mm
In [2]:
Plots.theme(:wong2)
gr(fmt=:png)

default(
    fontfamily     = "Computer Modern",
    titlefontsize  = 12,
    guidefontsize  = 11,
    tickfontsize   = 9,
    legendfontsize = 9,
    left_margin    = 12mm,
    bottom_margin  = 10mm,
    gridalpha      = 0.15,
    framestyle     = :box,
    lw             = 2,
    size           = (900, 500)
)
In [3]:
function dgp_keane_neal(; b=0, n=1000, F, rho)

    """
    Generates one sample of size n following the DGP of Keane & Neal (2024) p. 193.

    ### Input
    - `b`   -- structural coefficient β (default 0)
    - `n`   -- sample size (default 1000)
    - `F`   -- population first-stage F-statistic
    - `rho` -- degree of endogeneity ρ

    ### Output (named tuple)
    - `x`, `y`, `z` -- (n×1) vectors for regressor, outcome, and instrument
    """

    π   = sqrt(F / n)
    u   = randn(n)
    eta = randn(n)
    z   = randn(n)
    v   = rho * u + sqrt(1 - rho^2) * eta
    x   = π * z .+ v
    y   = b * x .+ u

    return (; x, y, z)

end
Out[3]:
dgp_keane_neal (generic function with 1 method)
In [4]:
function ols_estimator(x, y)

    """
    OLS estimator for the simple linear model y = βx + u.

    ### Input
    - `x` -- (n×1) regressor vector
    - `y` -- (n×1) outcome vector

    ### Output (named tuple)
    - `bhat` -- OLS estimate of β
    - `se`   -- standard error
    - `t`    -- t-statistic
    """

    bhat = x \ y
    uhat = y - x * bhat
    s    = (uhat' * uhat) / length(y)
    se   = sqrt(s / (x' * x))
    t    = bhat / se

    return (; bhat, se, t)

end
Out[4]:
ols_estimator (generic function with 1 method)
In [5]:
function iv_estimator(x, y, z)

    """
    Just-identified IV estimator with one endogenous variable and one instrument.

    ### Input
    - `x` -- (n×1) endogenous regressor
    - `y` -- (n×1) outcome vector
    - `z` -- (n×1) instrument vector

    ### Output (named tuple)
    - `bhat` -- IV estimate of β
    - `se`   -- standard error (Keane & Neal 2024, p. 190)
    - `t`    -- t-statistic

    ### Notes
    The SE uses the first-stage ESS = N·π̂²·Var(z) as the relevance measure.
    """

    bhat  = (z' * y) / (z' * x)   # β̂_IV = (z'y)/(z'x)

    n     = length(y)
    pihat = z \ x                  # first-stage coefficient π̂
    ESS   = n * pihat^2 * var(z)   # first-stage explained sum of squares
    uhat  = y - x * bhat
    s     = (uhat' * uhat) / n
    se    = sqrt(s / ESS)

    t = bhat / se

    return (; bhat, se, t)

end
Out[5]:
iv_estimator (generic function with 1 method)
In [6]:
function simulate_distribution(; b=0, F, rho, n=1000, rep=10000)

    """
    Monte Carlo simulation of OLS and IV estimator distributions.

    Creates `rep` independent datasets from dgp_keane_neal and collects
    estimates, standard errors, and t-statistics for OLS, IV, and the
    Anderson-Rubin (AR) test statistic.

    ### Input
    - `b`   -- true structural coefficient β (default 0)
    - `F`   -- population first-stage F-statistic
    - `rho` -- degree of endogeneity ρ
    - `n`   -- sample size (default 1000)
    - `rep` -- number of Monte Carlo replications (default 10,000)

    ### Output (named tuple of rep-length vectors)
    - `bols_dst`, `sols_dst`, `tols_dst` -- OLS estimate, SE, t-statistic
    - `biv_dst`,  `siv_dst`,  `tiv_dst`  -- IV  estimate, SE, t-statistic
    - `ar_dst`                            -- AR t-statistic (OLS of Y on Z)
    """

    bols_dst = Vector{Float64}(undef, rep)
    sols_dst = Vector{Float64}(undef, rep)
    tols_dst = Vector{Float64}(undef, rep)
    biv_dst  = Vector{Float64}(undef, rep)
    siv_dst  = Vector{Float64}(undef, rep)
    tiv_dst  = Vector{Float64}(undef, rep)
    ar_dst   = Vector{Float64}(undef, rep)

    for i in 1:rep
        x, y, z = dgp_keane_neal(b=b, F=F, rho=rho, n=n)

        bols_dst[i], sols_dst[i], tols_dst[i] = ols_estimator(x, y)
        biv_dst[i],  siv_dst[i],  tiv_dst[i]  = iv_estimator(x, y, z)
        ar_dst[i] = ols_estimator(z, y).t   # AR: regress Y on Z directly
    end

    return (; bols_dst, sols_dst, tols_dst, biv_dst, siv_dst, tiv_dst, ar_dst)

end
Out[6]:
simulate_distribution (generic function with 1 method)
In [7]:
function power_function(; brange=-1.00:0.10:1.00, F, rho, n=1000, rep=10000)

    """
    Computes empirical power of the OLS t-test, IV t-test, and AR test.

    Power = Pr(reject H₀: β = 0 | true β).

    ### Input
    - `brange` -- range of true β values (default -1.0:0.1:1.0)
    - `F`      -- population first-stage F-statistic
    - `rho`    -- degree of endogeneity ρ
    - `n`      -- sample size (default 1000)
    - `rep`    -- Monte Carlo replications per β value (default 10,000)

    ### Output
    - `brange`    -- the β grid (passed through)
    - `power_ols` -- empirical power of OLS t-test at each β
    - `power_iv`  -- empirical power of IV  t-test at each β
    - `power_ar`  -- empirical power of AR  test  at each β
    """

    power_ols = similar(brange)
    power_iv  = similar(brange)
    power_ar  = similar(brange)

    for (i, b) in enumerate(brange)
        sim         = simulate_distribution(b=b, F=F, rho=rho, n=n, rep=rep)
        power_ols[i] = mean(abs.(sim.tols_dst) .> 1.96)
        power_iv[i]  = mean(abs.(sim.tiv_dst)  .> 1.96)
        power_ar[i]  = mean(abs.(sim.ar_dst)   .> 1.96)
    end

    return brange, power_ols, power_iv, power_ar

end
Out[7]:
power_function (generic function with 1 method)

Creating DGPs¶

I'm creating two containers that store my simulated distributions:

  • dgp_zero: 10,000 samples from a DGP in which $\rho=0$ and $F=73.75$ (no endogeneity, strong IV). This serves as the ideal reference case for OLS.

  • dgps: a $3 \times 5$ array of simulations, one for each combination of $\rho \in \{0.10, 0.30, 0.50\}$ and $F \in \{1.82, 2.30, 10, 29.44, 73.75\}$.

The five $F$ values span the range from the weakest case (population $F = 1.82$, sample $F \approx 9$) to a very strong instrument (population $F = 73.75$, sample $F \approx 105$).

In [8]:
Random.seed!(1234)   # set seed for reproducibility

# Reference case: no endogeneity, strong IV
dgp_zero = simulate_distribution(rho=0, F=73.75)

# Grid of DGPs
parms_rho = (0.10, 0.30, 0.50)
parms_F   = (1.82, 2.30, 10.00, 29.44, 73.75)

dgps = [simulate_distribution(rho=rho, F=F) for rho in parms_rho, F in parms_F];

Standard Errors¶

Let's start by looking at histograms of IV standard errors under different parameter combinations.

Reading the grid: Each panel corresponds to one $(\rho, F)$ pair. The top-right panel ($\rho=0.10$, $F=73.75$) is our benchmark — strong IV with low endogeneity. We truncate each histogram at the 97th percentile to make the shape visible; heavy-tailed distributions under weak IV would extend much further to the right.

In [9]:
plt = plot(
    layout      = (length(parms_rho), length(parms_F)),
    size        = (1800, 700),
    plot_title  = "Empirical Distribution of IV Standard Errors (truncated at 97th percentile)",
    plot_titlefontsize = 13)

for (i, rho) in enumerate(parms_rho), (j, F) in enumerate(parms_F)
    k = length(parms_F) * (i-1) + j
    histogram!(plt,
        dgps[i,j].siv_dst,
        normalize  = true,
        subplot    = k,
        bins       = range(0, quantile(dgps[i,j].siv_dst, 0.97), length=51),
        color      = "#6C9BC2",
        fillalpha  = 0.5,
        linecolor  = :white,
        linewidth  = 0.5,
        legend     = false,
        title      = L"\rho = %$(rho),\; F = %$(F)",
        titlefontsize = 10)
end

display(plt)

Reading the Histograms¶

The top-right panel is the benchmark: a compact, right-skewed distribution of standard errors, as expected for a well-behaved IV estimator.

Moving left (weaker $F$) or down (higher $\rho$), the tails grow dramatically. Under weak instruments the standard errors are sometimes enormous — and those enormous values are not even visible because we truncated the histogram at the 97th percentile!

Why does this matter? Occasionally the standard errors can be very small, which would lead the IV t-test to incorrectly flag the estimate as precise and significant. The next section makes this concrete.

Plotting IV Estimates vs Their Standard Errors¶

Keane & Neal had the clever idea to scatter-plot each IV estimate $\hat{\beta}_{\text{IV}}$ against its standard error. This "funnel plot" reveals a structural flaw of the IV estimator under weak instruments.

We first study the best possible scenario: OLS under $\rho=0$ (no endogeneity) with a strong IV. I use the container dgp_zero for this exercise.

In [10]:
plot(dgp_zero.bols_dst, dgp_zero.sols_dst,
    seriestype  = :scatter,
    markersize  = 1.5,
    markeralpha = 0.3,
    markerstrokewidth = 0,
    legend      = false,
    size        = (800, 600),
    title       = "$(length(dgp_zero.bols_dst)) OLS estimates vs their standard errors\n(ρ = 0, popF = 73.75)",
    xlabel      = L"OLS estimate $\hat{\beta}_{\mathrm{OLS}}$",
    ylabel      = "Standard error")
Out[10]:

The cloud of estimates is roughly circular — there is no apparent relationship between the OLS estimate and its standard error.

Let's now add colour to highlight which estimates lead to a rejection of $H_0:\beta=0$ at the 5% level (i.e. $|t_{\text{OLS}}| > 1.96$), and overlay the approximate rejection boundaries $\text{se} = |\hat{\beta}|/1.96 \approx |\hat{\beta}|/2$.

In [11]:
rejected_ols = abs.(dgp_zero.tols_dst) .> 1.96

plt_ols = plot(dgp_zero.bols_dst, dgp_zero.sols_dst,
    seriestype        = :scatter,
    markersize        = 1.5,
    markeralpha       = 0.3,
    markerstrokewidth = 0,
    mc                = "#0072B2",
    legend            = false,
    size              = (800, 600),
    title             = "$(length(dgp_zero.bols_dst)) OLS estimates vs their standard errors\n(ρ = 0, popF = 73.75)",
    xlabel            = L"OLS estimate $\hat{\beta}_{\mathrm{OLS}}$",
    ylabel            = "Standard error")

plot!(plt_ols, dgp_zero.bols_dst[rejected_ols], dgp_zero.sols_dst[rejected_ols],
    seriestype        = :scatter,
    markersize        = 2.5,
    markeralpha       = 0.6,
    markerstrokewidth = 0,
    mc                = "#D55E00")

plot!(plt_ols, [0, 4], [0,  2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
plot!(plt_ols, [0, 4], [0, -2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
Out[11]:

Interpretation¶

These are 10,000 pairs of $\hat{\beta}_{\text{OLS}}$ and its standard error. In every replication the true $\beta$ was zero.

  • The OLS estimates are centred near zero with a small spread — exactly as expected.
  • The dashed blue lines mark the approximate rejection boundary $\text{se} = |\hat{\beta}|/2$. Points below the lines have small enough standard errors relative to the estimate to trigger rejection.
  • The orange points are estimates for which the OLS t-test (incorrectly) rejects $H_0:\beta=0$.
  • Key observation: there is no clear association between OLS estimates and their standard errors. The rejected estimates are simply those that happen to be far from zero.

Let's verify this by computing the correlation:

In [12]:
println("Correlation between OLS estimates and their SEs: ",
        round(cor(dgp_zero.bols_dst, dgp_zero.sols_dst), digits=4))
Correlation between OLS estimates and their SEs: 0.0021

A near-zero correlation is exactly what we want: standard errors should be uninformative about the direction or magnitude of the estimate.

Now let's do the same for the IV estimator under different parameter combinations. We start with the best case: low endogeneity ($\rho=0.10$) and a strong instrument ($F=73.75$).

In [13]:
rejected_iv_good = abs.(dgps[1,5].tiv_dst) .> 1.96

plt_iv_good = plot(dgps[1,5].biv_dst, dgps[1,5].siv_dst,
    seriestype        = :scatter,
    markersize        = 1.5,
    markeralpha       = 0.3,
    markerstrokewidth = 0,
    mc                = "#0072B2",
    legend            = false,
    size              = (800, 600),
    title             = "$(length(dgps[1,5].biv_dst)) IV estimates vs their standard errors\n(ρ = 0.10, popF = 73.75)",
    xlabel            = L"IV estimate $\hat{\beta}_{\mathrm{IV}}$",
    ylabel            = "Standard error")

plot!(plt_iv_good, dgps[1,5].biv_dst[rejected_iv_good], dgps[1,5].siv_dst[rejected_iv_good],
    seriestype        = :scatter,
    markersize        = 2.5,
    markeralpha       = 0.6,
    markerstrokewidth = 0,
    mc                = "#D55E00")

plot!(plt_iv_good, [0,4], [0, 2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
plot!(plt_iv_good, [0,4], [0,-2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
Out[13]:

The plot looks similar to the OLS reference case, though everything is spread out much more (wider ranges for both IV estimates and standard errors). Reassuringly, the correlation between estimates and standard errors is still near zero.

Now let's look at the worst case: $\rho = 0.50$ and $F=1.82$ (very weak instrument, moderate endogeneity).

In [14]:
rejected_iv_weak = abs.(dgps[3,1].tiv_dst) .> 1.96

plt_iv_weak = plot(dgps[3,1].biv_dst, dgps[3,1].siv_dst,
    seriestype        = :scatter,
    markersize        = 1.5,
    markeralpha       = 0.3,
    markerstrokewidth = 0,
    mc                = "#0072B2",
    legend            = false,
    size              = (800, 600),
    title             = "$(length(dgps[3,1].biv_dst)) IV estimates vs their standard errors\n(ρ = 0.50, popF = 1.82)",
    xlabel            = L"IV estimate $\hat{\beta}_{\mathrm{IV}}$",
    ylabel            = "Standard error")

plot!(plt_iv_weak, dgps[3,1].biv_dst[rejected_iv_weak], dgps[3,1].siv_dst[rejected_iv_weak],
    seriestype        = :scatter,
    markersize        = 2.5,
    markeralpha       = 0.6,
    markerstrokewidth = 0,
    mc                = "#D55E00")

plot!(plt_iv_weak, [0,4], [0, 2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
plot!(plt_iv_weak, [0,4], [0,-2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
Out[14]:

The extreme outliers make the plot unreadable. Let's trim the axes to the 99th percentile of the standard error and restrict the x-axis to $[-4, 4]$:

In [15]:
biv_w  = dgps[3,1].biv_dst
siv_w  = dgps[3,1].siv_dst
tiv_w  = dgps[3,1].tiv_dst
se_cap = quantile(siv_w, 0.99)

rejected_iv_trim = abs.(tiv_w) .> 1.96

plt_iv_trim = plot(biv_w, siv_w,
    seriestype        = :scatter,
    markersize        = 1.5,
    markeralpha       = 0.3,
    markerstrokewidth = 0,
    mc                = "#0072B2",
    xlims             = (-4, 4),
    ylims             = (0, min(4, se_cap)),
    legend            = false,
    size              = (800, 600),
    title             = "$(length(biv_w)) IV estimates vs their standard errors (outliers removed)\n(ρ = 0.50, popF = 1.82)",
    xlabel            = L"IV estimate $\hat{\beta}_{\mathrm{IV}}$",
    ylabel            = "Standard error")

plot!(plt_iv_trim, biv_w[rejected_iv_trim], siv_w[rejected_iv_trim],
    seriestype        = :scatter,
    markersize        = 2.5,
    markeralpha       = 0.6,
    markerstrokewidth = 0,
    mc                = "#D55E00")

plot!(plt_iv_trim, [0,4], [0, 2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
plot!(plt_iv_trim, [0,4], [0,-2], seriestype=:straightline, lc="#0072B2", linestyle=:dash, lw=1.5)
vline!(plt_iv_trim, [0.50], lw=2, lc="#D55E00", linestyle=:dot)
Out[15]:

A Disturbing Pattern¶

Even after removing outliers, something is clearly wrong. Unlike the OLS or strong-IV cases, there is now a visible negative association between IV estimates and their standard errors — large positive estimates tend to have small standard errors, while large negative estimates tend to have large standard errors.

The orange dotted vertical line marks the OLS bias at $\rho = 0.50$: the position toward which IV estimates are pulled when instruments are weak.

Notice also that the orange (rejected) points cluster on the positive side only. This is power asymmetry: the test rejects for positive estimates far more often than for negative ones of the same absolute magnitude.

Let's quantify the correlation (restricting to the non-outlier region):

In [16]:
keep = (abs.(biv_w) .<= 4) .& (siv_w .<= se_cap)
println("Correlation between IV estimates and their SEs (outliers removed): ",
        round(cor(biv_w[keep], siv_w[keep]), digits=4))
Correlation between IV estimates and their SEs (outliers removed): -0.1934

Why Are Large IV Estimates Associated with Small Standard Errors?¶

Keane & Neal explain this asymmetry on page 190 (their "Problem 2" and "Problem 4"). The argument is worth spelling out.

Recall the DGP (everything is scalar): $$ \begin{align*} Y_i &= \beta X_i + u_i\\ X_i &= \pi Z_i + v_i \end{align*} $$

The IV estimator is $$ \hat{\beta}_{\text{IV}} = \frac{s_{ZY}}{s_{XZ}} = \beta + \frac{s_{Zu}}{s_{XZ}} $$

Using the decomposition $v_i = \rho u_i + \sqrt{1-\rho^2}\,\eta_i$ from the DGP, we can write the denominator as: $$ s_{XZ} = \pi s_Z^2 + s_{Zv} = \pi s_Z^2 + \rho\, s_{Zu} + \sqrt{1-\rho^2}\, s_{Z\eta} \approx \sigma_{XZ} + \rho\, s_{Zu} $$ where the approximation replaces $\pi s_Z^2$ by its population counterpart $\sigma_{XZ}$ and drops $\sqrt{1-\rho^2}\, s_{Z\eta}$, which (unlike $s_{Zu}$) is independent of the numerator and so plays no systematic role in the bias mechanism.

On page 189, Keane & Neal give a heuristic definition of a strong instrument:

An instrument is strong when $s_{XZ}$ and $\sigma_{XZ}$ have the same sign — that is, when the random component $\rho\, s_{Zu}$ is not large enough to flip the sign of the denominator.

In the strong IV case ($\sigma_{XZ}$ dominates, so $s_{XZ}$ keeps the sign of $\sigma_{XZ} > 0$), the sign of the sampling error $\hat{\beta}_{\text{IV}} - \beta = s_{Zu}/s_{XZ}$ is determined entirely by the sign of $s_{Zu}$. Because $s_{Zu}$ is symmetric around zero, positive and negative deviations from $\beta$ are equally likely:

In the strong IV case, the median of $\hat{\beta}_{\text{IV}}$ is zero (= the true $\beta$). ✓

In the weak IV case ($\sigma_{XZ}$ is small relative to $\rho s_{Zu}$), something different happens:

  • A large positive $s_{Zu}$ makes $\hat{\beta}_{\text{IV}} > \beta$. This is the usual bias direction.
  • A large negative $s_{Zu}$ (i.e. $|s_{Zu}|$ large) flips the sign of $s_{XZ}$ (since $\sigma_{XZ} + \rho s_{Zu} < 0$ when $\rho > 0$). Then both numerator ($s_{Zu} < 0$) and denominator ($s_{XZ} < 0$) are negative, so $\hat{\beta}_{\text{IV}} > \beta$ again!

In other words, both large positive and large negative realisations of $s_{Zu}$ lead to $\hat{\beta}_{\text{IV}} > \beta$:

In the weak IV case, the median of $\hat{\beta}_{\text{IV}}$ is biased in the direction of $\operatorname{sign}(\rho)$.
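This median-bias claim is straightforward to check by simulation. The sketch below (my own check, not from the paper) sets $\beta = 0$ and $\rho = 0.5$ and compares the median of $\hat{\beta}_{\text{IV}}$ under a very weak instrument ($F = 1.82$) and a strong one ($F = 73.75$):

```julia
using Random, Statistics

# Median of β̂_IV when the true β = 0 and ρ = 0.5, for a very weak and a
# strong instrument (5,000 replications each).
Random.seed!(42)

function median_biv(F; n=1000, rho=0.5, rep=5000)
    piv = sqrt(F / n)
    bhats = map(1:rep) do _
        u, eta, z = randn(n), randn(n), randn(n)
        x = piv .* z .+ rho .* u .+ sqrt(1 - rho^2) .* eta
        y = u                        # true β = 0, so Y = u
        (z' * y) / (z' * x)          # just-identified IV estimate
    end
    return median(bhats)
end

m_weak   = median_biv(1.82)     # clearly positive: pulled toward sign(ρ)
m_strong = median_biv(73.75)    # essentially zero: the true β
```

Under the weak instrument the median sits visibly above the true value of zero, while under the strong instrument it is centred at zero: exactly the claim above.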

The Compounding Effect on Standard Errors¶

Recall from the Week 7 lecture notes: $$ \operatorname{se}(\hat{\beta}_{\text{IV}}) = \frac{s_u \cdot s_Z}{\sqrt{N} \cdot |s_{XZ}|} \approx \frac{s_u \cdot s_Z}{\sqrt{N} \cdot |\sigma_{XZ} + \rho\, s_{Zu}|} $$

Since $\rho > 0$, when $|s_{Zu}|$ is large the denominator $|\sigma_{XZ} + \rho\, s_{Zu}|$ is also large (regardless of sign), and so the standard error is small.

In conclusion, two things happen simultaneously when $|s_{Zu}|$ is large and $\rho > 0$:

  1. The IV estimator is biased toward OLS (positive direction for $\rho > 0$)
  2. Its standard error is spuriously small

This means that the underlying t-test rejects more often — but only in the positive direction. That is precisely the power asymmetry visible in the funnel plot above.
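A back-of-the-envelope illustration of the mechanism, with purely illustrative values $\sigma_{XZ} = 0.1$, $\rho = 0.5$, $s_u = s_Z = 1$, $N = 1000$, and true $\beta = 0$ (these numbers are chosen for readability, not estimated from anything):

```julia
# Illustrative values (not from data): σ_XZ = 0.1, ρ = 0.5, s_u = s_Z = 1,
# N = 1000, true β = 0.
sigma_XZ, rho, N = 0.1, 0.5, 1000

results = map((-1.0, 0.0, 1.0)) do s_Zu
    s_XZ = sigma_XZ + rho * s_Zu          # (approximate) denominator of β̂_IV
    bhat = s_Zu / s_XZ                    # β̂_IV − β = s_Zu / s_XZ
    se   = 1 / (sqrt(N) * abs(s_XZ))      # se ≈ s_u·s_Z / (√N·|s_XZ|)
    (; s_Zu, bhat, se)
end
```

Both extreme draws of $s_{Zu}$ push $\hat{\beta}_{\text{IV}}$ well above the true value of zero with a comparatively small standard error, while the moderate draw produces a large one: exactly the negative association visible in the funnel plot.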

In [17]:
plt_grid = plot(
    layout     = (length(parms_rho), length(parms_F)),
    size       = (1800, 800),
    plot_title = "IV Estimate vs Standard Error for Different DGPs",
    plot_titlefontsize = 13)

for (i, rho) in enumerate(parms_rho)
    for (j, F) in enumerate(parms_F)
        k      = length(parms_F) * (i-1) + j
        b_vec  = dgps[i,j].biv_dst
        s_vec  = dgps[i,j].siv_dst
        t_vec  = dgps[i,j].tiv_dst
        rejected = abs.(t_vec) .> 1.96
        ylim   = min(4, quantile(s_vec, 0.99))

        plot!(plt_grid, b_vec, s_vec,
            seriestype        = :scatter,
            markersize        = 1,
            markeralpha       = 0.2,
            markerstrokewidth = 0,
            mc                = "#0072B2",
            subplot           = k,
            xlims             = (-4, 4),
            ylims             = (0, ylim),
            legend            = false,
            title             = L"\rho = %$(rho),\; F = %$(F)",
            titlefontsize     = 10)

        plot!(plt_grid, b_vec[rejected], s_vec[rejected],
            seriestype        = :scatter,
            markersize        = 1.5,
            markeralpha       = 0.4,
            markerstrokewidth = 0,
            mc                = "#D55E00",
            subplot           = k)

        plot!(plt_grid, [0,4], [0, 2], seriestype=:straightline, subplot=k,
            lc="#0072B2", linestyle=:dash, lw=1)
        plot!(plt_grid, [0,4], [0,-2], seriestype=:straightline, subplot=k,
            lc="#0072B2", linestyle=:dash, lw=1)
        vline!(plt_grid, [rho], subplot=k, lw=1.5, lc="#D55E00", linestyle=:dot)
    end
end

display(plt_grid)

Power Functions¶

The funnel plots above show that weak instruments distort the standard errors in a systematic way. But there is another consequence that is just as important: low power.

The funnel plots suggested that weak instruments produce spurious rejections in one direction; it stands to reason that there are correspondingly fewer rejections in the other direction. Power is being "borrowed" from one side of the distribution and "spent" on the other. Let's make this precise.

How to compute power:

Fix $\rho$ and $F$. Then:

  1. Set the true $\beta = -1.0$
  2. Generate 10,000 samples, compute 10,000 t-statistics
  3. Record the fraction of samples for which $|t| > 1.96$ — this is the empirical power at $\beta = -1.0$
  4. Repeat for $\beta = -0.9, -0.8, \ldots, 1.0$

The resulting power curve shows how likely the test is to detect a non-zero $\beta$ at each point in the parameter space. When $\beta = 0$ the power equals the size.

In [18]:
# Uses a fine beta grid for the benchmark case (this may take a minute)
brange_fine = -1:0.025:1
brange, pow_ols, pow_iv, pow_ar = power_function(brange=brange_fine, F=73.75, rho=0)

plot(brange, [pow_ols, pow_iv],
    xticks   = -1:0.1:1,
    label    = ["OLS" "IV"],
    linewidth = 2.5,
    linestyle = [:solid :dash],
    lc        = ["#0072B2" "#E69F00"],
    legend    = :bottomright,
    size      = (900, 500),
    title     = L"Empirical Power: OLS vs IV ($\rho = 0$, popF = 73.75)",
    xlabel    = L"True $\beta$",
    ylabel    = "Power")
hline!([0.05], linestyle=:dot, lc=:gray50, lw=1.2, label="5% nominal size")
ylims!(0, 1)
Out[18]:

Reading the Power Curve¶

  • The blue solid line is the OLS power curve. It shows the probability of rejecting $H_0:\beta=0$ for each true value of $\beta$. When $\beta=0$, power = size ≈ 5%.
  • The orange dashed line is the IV power curve. It lies below OLS throughout.
  • The horizontal dotted line marks the 5% nominal size.

For example: when the true $\beta = 0.10$, OLS rejects with probability ≈ 90%, while IV rejects with only ≈ 10% probability. This illustrates the fundamental precision loss from using IV when OLS is valid.

Lesson: Always use OLS when there is no endogeneity. IV sacrifices power, and when $\rho=0$ there is no benefit to justify that cost.

Digression: Effect Size¶

Keane & Neal note that in their DGP, a value of $\beta = 0.20$ constitutes a large effect ("This is a large effect in typical empirical applications"). Why?

In our normalised DGP, a one-standard-deviation increase in $X$ raises $Y$ by $\beta$ standard deviations. A value of $\beta = 0.20$ is a substantial effect by the standards of the economics literature.

Implications at $\beta = 0.20$:

  • OLS power: essentially 100% — you would almost certainly detect such an effect.
  • IV power: only around 40% — you have less than a coin-flip chance of detecting it.

This illustrates why using IV "just to be safe" has a real cost even when the instruments are strong.
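The standard-deviation-units claim can be checked with one large sample from the DGP (benchmark parameters $\rho = 0.10$, population $F = 73.75$, $\beta = 0.20$; the sample size is inflated here only to reduce simulation noise):

```julia
using Random, Statistics

# One large sample from the DGP: both X and Y have (approximately) unit
# standard deviation, so β = 0.20 really is an effect in sd units.
Random.seed!(1)
n, F, rho, beta = 1_000_000, 73.75, 0.10, 0.20
piv = sqrt(F / n)

u, eta, z = randn(n), randn(n), randn(n)
x = piv .* z .+ rho .* u .+ sqrt(1 - rho^2) .* eta
y = beta .* x .+ u

sx, sy = std(x), std(y)   # ≈ 1.00 and ≈ 1.04
```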

Now let's examine how the power curves look under endogeneity ($\rho > 0$) and for the weaker instruments.

In [19]:
# This cell computes 15 power curves (takes a few minutes)
plt_pow = plot(
    layout     = (length(parms_rho), length(parms_F)),
    size       = (1800, 800),
    plot_title = "Empirical Power: OLS (solid) vs IV (dashed)",
    plot_titlefontsize = 13)

for (i, rho) in enumerate(parms_rho)
    for (j, F) in enumerate(parms_F)
        k = length(parms_F) * (i-1) + j
        brange, pow_ols, pow_iv, pow_ar = power_function(F=F, rho=rho)
        emp_size = round(100 * pow_iv[brange .== 0.0][], digits=2)

        plot!(plt_pow, brange, [pow_ols, pow_iv],
            label     = ["OLS" "IV"],
            linestyle = [:solid :dash],
            lc        = ["#0072B2" "#E69F00"],
            legend    = false,
            subplot   = k,
            margin    = 5mm,
            title     = L"\rho = %$(rho),\; F = %$(F)" * "\nSize = $(emp_size)%",
            titlefontsize = 9)

        hline!([0.05], linestyle=:dot, lc=:gray50, lw=1, subplot=k, legend=false)
        ylims!(0, 1)
        xlabel!(L"True $\beta$")
    end
end

display(plt_pow)

Reading the Power Grid¶

The OLS power curve (blue solid):

  • Under endogeneity ($\rho > 0$), the OLS curve shifts to the left. For example, when $\rho = 0.3$, OLS rejects $H_0:\beta=0$ with near-100% probability for any $\beta > 0$, even small ones. This looks like high power but is an artifact of bias: OLS is detecting the endogeneity bias, not the true $\beta$.
  • The cost: OLS has very low power for negative $\beta$, because the bias pushes estimates toward positive values. If the true effect is negative and small, OLS will miss it.

The IV power curve (orange dashed):

  • Even at population $F = 10$ (which maps to sample $F \approx 23$, well above the rule of thumb), IV power is very low.
  • The IV power curve is roughly symmetric around $\beta = 0$ only when $F$ is large. For weak instruments, power asymmetry makes the curve lopsided.
  • The rule of thumb ($F > 2.30$, sample $F > 10$) is clearly insufficient for good power.

The empirical size (power at $\beta=0$) is shown in each panel title. For weak instruments it can exceed 5% considerably, consistent with Week 8 findings.

Bottom line: The rule of thumb that sample $F > 10$ is not sufficient for good power. Even $F = 10$ can produce very low power and distorted tests. Keane & Neal's recommendation is that sample $F$ should be much larger than 10 — and even then, one should report the IV standard error and effect size transparently.

The AR Test¶

Keane & Neal propose a simple remedy to the power asymmetry problem.

Anderson-Rubin (AR) test: Instead of running the IV regression and testing $H_0:\beta=0$ using the IV t-statistic, simply regress $Y$ directly on $Z$ (the reduced form) and test whether the coefficient is zero. The AR t-statistic is just the OLS t-statistic from this reduced-form regression.

Why does this work? Under the null $H_0:\beta=0$, the structural equation becomes $Y_i = u_i$, which is uncorrelated with $Z_i$ (by the exclusion restriction). So the AR statistic is a valid test of the null. Under the alternative, a non-zero $\beta$ means $Z$ does predict $Y$ through $X$, so the test has power.

Crucially, the AR test bypasses the problematic denominator $s_{XZ}$ that causes the standard IV t-test to be unreliable under weak instruments.
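The size claim is cheap to verify: under $H_0:\beta=0$ we have $Y_i = u_i$, so the AR statistic is an ordinary OLS t-statistic computed on data that genuinely satisfy the null, whatever the strength of the instrument. A minimal Monte Carlo sketch:

```julia
using Random, Statistics

# Rejection rate of the AR test at the 5% level when H₀: β = 0 is true.
# Under the null Y = u, so the first stage (and hence F and ρ) is irrelevant.
Random.seed!(2024)
n = 1000

rej = mean(1:4000) do _
    z, u = randn(n), randn(n)
    y = u                                    # H₀ holds: β = 0
    g = z \ y                                # reduced-form coefficient
    r = y - z * g
    t = g / sqrt((r' * r) / n / (z' * z))    # AR t-statistic
    abs(t) > 1.96
end

rej   # ≈ 0.05
```

The rejection rate stays at the nominal 5% level; contrast this with the IV t-test sizes reported in the power-grid panel titles above.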

In [20]:
# This cell re-uses the power_function results (re-computes to match earlier)
plt_ar = plot(
    layout     = (length(parms_rho), length(parms_F)),
    size       = (1800, 800),
    plot_title = "Empirical Power: AR (solid) vs IV (dashed)",
    plot_titlefontsize = 13)

for (i, rho) in enumerate(parms_rho)
    for (j, F) in enumerate(parms_F)
        k = length(parms_F) * (i-1) + j
        brange, pow_ols, pow_iv, pow_ar = power_function(F=F, rho=rho)
        emp_size = round(100 * pow_ar[brange .== 0.0][], digits=2)

        plot!(plt_ar, brange, [pow_ar, pow_iv],
            label     = ["AR" "IV"],
            linestyle = [:solid :dash],
            lc        = ["#009E73" "#E69F00"],
            legend    = false,
            subplot   = k,
            margin    = 5mm,
            title     = L"\rho = %$(rho),\; F = %$(F)" * "\nSize = $(emp_size)%",
            titlefontsize = 9)

        hline!([0.05], linestyle=:dot, lc=:gray50, lw=1, subplot=k, legend=false)
        ylims!(0, 1)
        xlabel!(L"True $\beta$")
    end
end

display(plt_ar)

Discussion and Takeaways¶

What the AR vs IV Comparison Shows¶

  • The AR power curve (green) is symmetric around $\beta=0$: it is equally likely to reject for positive and negative effects of equal magnitude. This is the correct behaviour for a well-calibrated test.
  • The IV power curve (orange) is asymmetric under weak instruments and endogeneity: it rejects too easily for estimates near the OLS bias and too rarely for estimates in the opposite direction.
  • For strong instruments ($F = 73.75$) the AR and IV curves are nearly identical — as they should be.

Practical Recommendations (Keane & Neal)¶

| Recommendation | Rationale |
|---|---|
| Do not use the standard IV t-test for inference when instruments might be weak | Power asymmetry leads to misleading rejections in one direction |
| Use the AR test instead | Symmetric power, valid size even under weak instruments |
| Sample F should be much larger than 10 | Even $\widehat{F} = 23$ (popF = 10) produces poor power |
| Report the first-stage F transparently | Readers can judge the severity of the weak-IV problem |

How Large Should F Be?¶

Looking at the AR power curves, even a population $F$ of 29.44 (sample $F \approx 50$) delivers only modest power against an effect as large as $\beta = 0.20$, and power at that effect size remains well below 80% even at population $F = 73.75$ (sample $F \approx 105$). The familiar rule of thumb (sample $F > 10$) is nowhere near sufficient.

Connection to Week 7¶

Recall from Week 7 that the approximate 2SLS bias is $\approx L\rho/F$. This bias formula already hinted that $F$ must be large relative to $L$ and $\rho$. The current analysis makes the same point from the perspective of hypothesis testing: a small $F$ not only biases the estimate but also distorts both size and power in ways that make standard inference unreliable.