We've seen in the lecture that instrument strength is gauged by the first-stage F-statistic.
Staiger & Stock (1997) came up with the Rule of Thumb that a first-stage F above 10 indicates that the instruments can be considered strong.
Their reasoning will become clearer by the end of this notebook.
Subsequently, Stock & Yogo (2005) refined this rule and provided various critical values depending on the parameter configuration of the underlying model.
Let's investigate their argument.
We will restrict ourselves to the toy model of Keane & Neal (2024).
Their DGP is on page 193, summarized here:
$$
\begin{align*}
Y_i &= \beta X_i + u_i\\
X_i &= \pi Z_i + v_i\\
v_i &= \rho u_i + \sqrt{1-\rho^2}\,\eta_i
\end{align*}
$$
where
$u_i \sim N(0,1)$
$\eta_i \sim N(0,1)$
$Z_i \sim N(0,1)$
$\beta = 0$
notice: since $u_i$ and $\eta_i$ are independent standard normals, $\sigma_v^2 = \rho^2 + (1-\rho^2) = 1$
To be able to create data sets from the above DGP, we need to choose values for $\pi$ and $\rho$.
We determine $\pi$ in a somewhat roundabout way.
Recall the definition of the population F from lecture 6: F is $N$ times the ratio of the proportion of the variance of $X$ explained by $Z$ to the proportion explained by $v$:
$$ F = N \cdot \frac{R^2}{1-R^2} = N \cdot \frac{\text{ESS}}{\text{RSS}} = N \cdot \frac{\text{Var}(Z\pi)}{\sigma_v^2} = N \cdot \frac{\pi^2 \text{Var}(Z)}{\sigma_v^2} = N \cdot \pi^2 $$
This allows us to set $\pi$ in the above DGP by choosing a value of $F$, because $\pi = \sqrt{F/N}$.
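As a quick numeric check of this mapping (a minimal sketch; `pi_from_F` is just an illustrative helper, not from Keane & Neal):
pi_from_F(F, n) = sqrt(F/n)       # pi implied by a target population F and sample size n
pi_from_F(10, 1000)               # 0.1
1000 * pi_from_F(10, 1000)^2      # recovers the population F of 10 (up to floating point)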
I'm parking a bunch of functions that are needed for the simulations.
using Distributions, Random
function dgp_keane_neal(; b=0, n=1000, F, rho)
"""
Generates one sample of size n following the DGP of Keane & Neal (2024) page 193
### Input
- `b` -- structural coefficient beta (scalar)
- `n` -- sample size
- `F` -- reduced form F-stat
- `rho` -- degree of endogeneity
### Output
- `x` -- (n by 1) vector representing endogenous regressor
- `y` -- (n by 1) vector representing outcome variable
- `z` -- (n by 1) vector representing instrumental variable
"""
p = sqrt(F/n)
u = rand(Normal(0, 1), n)
eta = rand(Normal(0, 1), n)
z = rand(Normal(0, 1), n)
v = rho*u + sqrt(1-rho^2)*eta
x = p*z .+ v
y = b*x .+ u
return (; x, y, z)
end
function ols_estimator(x, y)
"""
Implements OLS estimation of linear model with one exogenous regressor.
### Input
- `x` -- (n by 1) vector representing regressor
- `y` -- (n by 1) vector representing outcome variable
### Output
- `bhat` -- OLS estimate of beta
- `se` -- standard error of OLS estimate
- `t` -- t-statistic of OLS estimator
"""
# OLS estimator
bhat = x\y
# standard error
uhat = y - x*bhat
s = uhat'uhat/length(y)
se = sqrt(s/(x'x))
# t-statistic
t = bhat/se
return (; bhat, se, t) # returning named tuple
end
function iv_estimator(x, y, z)
"""
Implements IV estimation of linear model with one endogenous variable, and one instrument.
### Input
- `x` -- (n by 1) vector representing endogenous regressor
- `y` -- (n by 1) vector representing outcome variable
- `z` -- (n by 1) vector representing instrumental variable
### Output
- `bhat` -- IV estimate of beta
- `se` -- standard error of IV estimate
- `t` -- t-statistic of IV estimator
### Notes
For calculation of standard error, we're using the formula on page 190 of Keane & Neal (2024)
"""
# IV estimator
bhat = (z'y)/(x'z)
# standard error (using the formula in Keane & Neal (2024))
n = length(y)
pihat = z\x # first-stage coefficient estimate (regress x on z)
ESS = n*pihat^2*var(z) # explained sum of squares of the first stage
uhat = y - x*bhat
s = uhat'*uhat/n
se = sqrt(s/ESS)
# t-statistic
t = bhat/se
return (; bhat, se, t) # returning a named tuple
end
function simulate_distribution(; b=0, F, rho, rep=10000)
"""
Creates finite sample distributions of
- the OLS estimator, its standard error, and its t-stat,
- the IV estimator, its standard error, and its t-stat,
- the AR statistic.
How? It draws `rep` samples from the DGP and each time calculates the OLS and IV estimators, their standard errors and t-stats, and the AR statistic.
### Input
- `b` -- structural coefficient beta (scalar, default 0)
- `F` -- reduced form F-stat
- `rho` -- degree of endogeneity
- `rep` -- number of repetitions/simulations run
### Output
- `bols_dst` -- (rep by 1) vector collecting rep simulations of the OLS estimator
- `sols_dst` -- (rep by 1) vector collecting rep simulations of the standard error of the OLS estimator
- `tols_dst` -- (rep by 1) vector collecting rep simulations of the t-stat of the OLS estimator
- `biv_dst` -- (rep by 1) vector collecting rep simulations of the IV estimator
- `siv_dst` -- (rep by 1) vector collecting rep simulations of the standard error of the IV estimator
- `tiv_dst` -- (rep by 1) vector collecting rep simulations of the t-stat of the IV estimator
- `ar_dst` -- (rep by 1) vector collecting rep simulations of the AR statistic
"""
bols_dst = Array{Float64}(undef, rep)
sols_dst = Array{Float64}(undef, rep)
tols_dst = Array{Float64}(undef, rep)
biv_dst = Array{Float64}(undef, rep)
siv_dst = Array{Float64}(undef, rep)
tiv_dst = Array{Float64}(undef, rep)
ar_dst = Array{Float64}(undef, rep)
for i = 1:rep
x, y, z = dgp_keane_neal(b=b, F=F, rho=rho)
# calculating simulated distribution for bols, seols, and tols
bols_dst[i], sols_dst[i], tols_dst[i] = ols_estimator(x, y)
# calculating simulated distribution for biv, seiv, and tiv
biv_dst[i], siv_dst[i], tiv_dst[i] = iv_estimator(x, y, z)
# calculating AR statistic
# by regressing Y on Z
ar_dst[i] = ols_estimator(z, y).t
end
return (; bols_dst, sols_dst, tols_dst, biv_dst, siv_dst, tiv_dst, ar_dst)
end
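As a quick sanity check before running the full simulations (a minimal sketch, not part of Keane & Neal's code): draw a single sample with a known population F and look at the realized first-stage F, i.e. the squared t-statistic from regressing x on z. It will fluctuate from draw to draw, which is exactly the noncentral $\chi^2$ behaviour we return to later.
# one draw from the DGP with population F = 10 and moderate endogeneity
x, y, z = dgp_keane_neal(F=10.0, rho=0.5)
first_stage = ols_estimator(z, x)   # first stage: regress x on z
sample_F = first_stage.t^2          # with one instrument, F = t^2
println("sample first-stage F ≈ ", round(sample_F, digits=2))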
We will explore different versions of Keane & Neal's DGP, where we vary the instrument strength and the degree of endogeneity. To frame this, we will consider the following combinations:
DGP # | population F | $\rho$ | Description |
---|---|---|---|
1 | 1.82 | 0.1 | weak IV, low degree of endogeneity |
2 | 2.30 | 0.1 | rule of thumb IV, low degree of endogeneity |
3 | 73.75 | 0.1 | strong IV, low degree of endogeneity |
4 | 1.82 | 0.5 | weak IV, moderate degree of endogeneity |
5 | 2.30 | 0.5 | rule of thumb IV, moderate degree of endogeneity |
6 | 73.75 | 0.5 | strong IV, moderate degree of endogeneity |
7 | 1.82 | 0.9 | weak IV, high degree of endogeneity |
8 | 2.30 | 0.9 | rule of thumb IV, high degree of endogeneity |
9 | 73.75 | 0.9 | strong IV, high degree of endogeneity |
I will explain soon how these particular choices for F come about, and why the value of 2.30 is labelled 'rule of thumb'.
With these combinations of $F$ and $\rho$ we are now able to create data sets that are based on the above DGP.
Let's use the above Julia functions to create 10,000 samples of size 1,000 for each of the nine DGPs.
# let's represent these DGPs by tuples
parms_rho = (0.10, 0.50, 0.90)
parms_F = (1.82, 2.30, 73.75)
# create an object `dgps` that collects the simulation results for each of the nine parameter combinations:
dgps = [simulate_distribution(rho = rho, F = F) for rho in parms_rho, F in parms_F];
Let's start by plotting the distributions of the IV estimators for each case.
We're plotting the standardized IV estimator here, that is, its t-statistic.
Why? Because we know that a well-behaved t-stat should have (almost) a standard normal distribution.
What do you think? Do all of these look standard normal?
using Plots
using Plots.PlotMeasures: mm
Plots.theme(:wong2)
plt = plot(layout=(length(parms_F),length(parms_rho)), size = (1800, 800),
plot_title = "Empirical Distriution of t-statistic for different DGPs")
for (i, rho) in enumerate(parms_rho)
for (j, F) in enumerate(parms_F)
histogram!(plt,
dgps[i,j].tiv_dst,
normalize = true,
subplot = length(parms_F)*(i-1)+j,
bins = range(-4, 4, length=51),
legend=false,
title = "\\rho = $rho and popF = $F")
end
end
display(plt)
What do we learn from the above picture?
Recall: a well-behaved t-stat should be (approximately) standard normal.
The top right picture comes closest to that ideal
It's the case of low endogeneity and strong instrument
We will regard this to be our benchmark case
Relative to that benchmark, the bottom left four histograms don't look hopeful!
We've just seen that in some scenarios, the t-stat can be very non-normal
Statistical inference based on the t-stat could thus be on shaky ground.
When we run a t-test, we calculate $$ t := \frac{\hat{\beta}_{IV}}{se(\hat{\beta}_{IV})} $$
Let's say we obtain a value of $t=4.312$. What does this tell us? Well, we compare that value to the cutoff 1.96 and decide that $4.312 > 1.96$ and therefore we are sufficiently far away from zero to make the call that the true coefficient is probably not zero.
For our DGP we happen to know that the true $\beta = 0$, so our finding that $\beta \neq 0$ is incorrect! We understand that this can happen sometimes; in fact, for a properly behaved t-stat it will happen 5\% of the time. This is the significance level, or type I error rate, or statistical size of our test.
Size = probability to reject the null hypothesis when it is correct.
Did you notice where we used the CLT just now? Correct: by comparing the value of $t$ to 1.96, which comes from the standard normal distribution!
But what if $t$ doesn't have a standard normal distribution? Then our test may be poor: while the nominal size is 5\% (the size the test would have if $t$ really followed a standard normal distribution), the empirical size may be something completely different.
Luckily we are able to simulate the empirical size.
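Here is a minimal sketch of that computation, using the `dgps` array created above (assuming it is still in scope): the benchmark case from the histograms, strong instrument and low endogeneity, is `dgps[1, 3]`, and its empirical size should come out close to the nominal 5%.
# empirical size = share of replications in which |t| exceeds the 5% normal cutoff
crit = 1.96
size_benchmark = mean(abs.(dgps[1, 3].tiv_dst) .> crit)   # rho = 0.10, popF = 73.75
println("empirical size in the benchmark case: ", round(size_benchmark, digits=3))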
Stock and Yogo (2005) is an influential paper that still guides practitioners
Their idea: let population $F$ vary and simulate the empirical size
Declare an instrument to be strong when the empirical size isn't too bad
All the while, we fix $\rho$ at the extreme value of 1
This is Stock & Yogo's worst-case scenario
They conjecture that the weak instrument problem is amplified by $|\rho|$
In other words, by fixing $\rho =1$ we study a conservative worst-case situation
But when precisely should we declare instruments to be weak?
Short answer: when the small-sample distribution of the IV estimator becomes so non-normal that we can't rely on our trusty t-test anymore!
Let's study how the empirical size varies with $F$. We vary $F$ in small increments between 0 and 20, and calculate the proportion of $|t| > 1.96$ among 10,000 replications of the DGP.
using Plots
using Plots.PlotMeasures: mm
Plots.theme(:wong2)
F_range = 0:.25:20
emp_size = [mean(abs.(simulate_distribution(F=F, rho=1).tiv_dst) .> 1.96) for F in F_range]
plot(F_range, emp_size,
legend=false,
xlab="pop F",
xticks=0:.5:20,
size=(1400,800),
left_margin=20mm)
hline!([0.05, 0.05], linestyle=:dash)
hline!([0.100, 0.100], linestyle=:dash)
hline!([0.150, 0.150], linestyle=:dash)
annotate!(-1.5, 0.150, text("15% line", 10))
annotate!(-1.5, 0.100, text("10% line", 10))
annotate!(-1.5, 0.05, text("5% line", 10))
ylims!(0,1)
title!("Empirical Size at Varying Values of popF \\n (Degree of endogeneity \\rho = 1)")
When population $F$ is large (say above 20), then the empirical size gets close to the desired level of 5% (even for the extreme case $\rho =1 $)
If our $F$ happens to be lower than that, we have to concede that our empirical size cannot be 5%
Stock and Yogo's suggestion:
We understand that we cannot attain an empirical size of 5%
But we're willing to live with a somewhat larger size, say between 10 and 15% (worst case when $\rho=1$)
Which population $F$ guarantees that?
The picture above suggests $F \approx 2.30$ (see the quick check below)
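We can read this threshold off the simulation directly; a minimal sketch, assuming `F_range` and `emp_size` from the cell above are still in scope (simulation noise means the answer is only approximate):
# smallest popF in F_range whose simulated worst-case size drops below the target
target = 0.135                        # worst-case size we are willing to accept
idx = findfirst(<=(target), emp_size)
println("popF ≈ ", F_range[idx], " keeps the worst-case size below ", target)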
For practitioners, a more relevant question is this: What sample $F$ guarantees that?
Fun fact: with a single instrument, the sample $F$ has a so-called noncentral $\chi^2$ distribution with one degree of freedom and noncentrality parameter equal to the population $F$
Luckily, Julia knows what this distribution looks like
This allows us to map from population F to sample F:
using Distributions
F1 = NoncentralChisq(1, 1.82)
F2 = NoncentralChisq(1, 2.30)
F3 = NoncentralChisq(1, 5.78)
F4 = NoncentralChisq(1,10.00)
F5 = NoncentralChisq(1,29.44)
F6 = NoncentralChisq(1,73.75)
plot(x -> cdf(F1, x), 0, 120,
legend = :bottomright,
label = "popF=1.82",
size = (800, 600),
xticks = 0:10:100,
title = "Conversion: Population F vs Sample F \\n Plots of Various Noncentral \\chi^{2} Distributions")
plot!(x -> cdf(F2, x), label = "popF=2.30", lw=3)
plot!(x -> cdf(F3, x), label = "popF=5.78")
plot!(x -> cdf(F4, x), label = "popF=10.00")
plot!(x -> cdf(F5, x), label = "popF=29.44")
plot!(x -> cdf(F6, x), label = "popF=73.75")
hline!([0.95,0.95], linestyle=:dash, label=false)
vline!([10,10], linestyle=:dash, label=false)
annotate!(65, 0.90, text("95% line", 10))
How do we read the above picture?
Let's focus on the slightly thicker line for which population F is equal to 2.30.
That line gives the cdf of the noncentral $\chi^2$ distribution with noncentrality parameter 2.30
In words: if the population $F$ is 2.30 then 95\% of sample $F$ should fall below 10
Contrapositive: if in an application you obtain a sample $F$ of at least 10, then it is unlikely that the underlying population $F$ was 2.30 (or lower)
This is a simple, intuitive construction of a confidence interval for the population $F$:
a sample $F$ of 10 implies the (one-sided) confidence interval population $F > 2.30$, which in turn implies a worst-case empirical size of at most 13.5\%
Mapping from population F to sample F:
Case | population F | sample F | Worst case empirical size (when $\rho=1$) |
---|---|---|---|
1 | 1.82 | 8.96 | 15\% |
2 | 2.30 | 10.00 | 13.5\% |
3 | 5.78 | 16.38 | 10\% |
4 | 10.00 | 23.10 | 8.6\% |
5 | 29.44 | 50.00 | 6.4\% |
6 | 73.75 | 104.70 | 5.0\% |
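The 'sample F' column can be reproduced in Julia (a minimal sketch): it is, approximately, the 95% quantile of the noncentral $\chi^2$ distribution whose noncentrality parameter equals the population F.
# 95% quantiles of the noncentral chi-square distributions from the table above
pop_F = [1.82, 2.30, 5.78, 10.00, 29.44, 73.75]
sample_F = [quantile(NoncentralChisq(1, f), 0.95) for f in pop_F]
round.(sample_F, digits=2)   # should match the 'sample F' column (up to rounding)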
Bottom line for practitioners:
If we want to be confident that the worst case empirical size will not exceed 15\%, then check that first stage (sample) F exceeds 8.96.
(That's the origin of the Rule of Thumb that F should exceed 10.)
The table also shows that if you strive for an empirical size of 5\%, then you need a sample F of 104.7.
Stock and Yogo's approach is to recommend a sufficiently high sample F to avoid empirical size problems under the worst case scenario when $\rho=1$.
This is quite an unlikely case in practice. Here's a back-of-the-envelope calculation for a typical returns-to-schooling problem (taken from Keane & Neal):
sample correlation between earnings and schooling is 0.45 (using data from the US Panel Study of Income Dynamics 2025)
two effects can explain this correlation:
direct causal effect of education on earnings
indirect effect of ability on earnings (via ability's correlation with education)
that indirect effect is the source of endogeneity
the largest magnitude that $\rho$ can take is 0.45, which would occur if the causal effect were zero
we conclude that $\rho$ must be somewhere between 0 and 0.45
This suggests that the worst case scenario of $\rho=1$ might both be unrealistic and unnecessarily conservative!
Are we over-insuring ourselves by considering such an extreme scenario?
Quick check: consider case 1 (popF = 1.82), and plot the empirical size against a changing degree of endogeneity!
# replicating parts of Figure 1 of K&N
using Plots
using Plots.PlotMeasures: mm
rho_range = 0:0.10:1
emp_size = [mean(abs.(simulate_distribution(F=1.82, rho=rho).tiv_dst) .> 1.96) for rho in rho_range]
plot(rho_range, emp_size,
legend=false,
xlab="Degree of Endogeneity \\rho",
left_margin=20mm,
size=(800,600))
hline!([0.05, 0.05])
annotate!(.10, 0.06, text("5% line", 10))
ylims!(0,.20)
title!("Empirical Size at Varying Degrees of \\rho \\n (popF = 1.82)")
What do we learn here?
It looks like the worst-case scenario of $\rho=1$ produces a particularly bad result that is not representative of more moderate cases.
More moderate degrees of endogeneity, say around $\rho=0.45$ (which might be more realistic in practice), are associated with good size properties!
On the other hand, very low degrees of endogeneity result in very low empirical size.
(Such under-rejection may seem desirable, but it is another sign that the t-stat is not behaving the way the standard normal benchmark promises.)
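To put a number on that last point, a quick check using the `dgps` array from earlier (assuming it is still in scope): the weak-instrument, low-endogeneity case is `dgps[1, 1]`, and per the figure above its rejection rate should come out at or below the nominal 5%.
# empirical size at popF = 1.82 and rho = 0.10 (DGP 1 in the table above)
mean(abs.(dgps[1, 1].tiv_dst) .> 1.96)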