%% Blog 18

close all;

addpath(genpath('../lib/brewermap'));

%% Simulate data

close all;

% Draw N observations of two predictors and one response. X2 is built
% from X1 plus noise, Y from both predictors plus noise; X2 and Y are
% standardised with zscore.
N = 100;
X1 = randn(N, 1);
X2 = zscore(0.25*X1 + randn(N, 1)*0.8);
Y = zscore(1.2 + 0.5*X1 + 0.7*X2 + randn(N, 1)*0.8);

% Pairwise scatter plots, one figure per row: {x data, y data, x label, y label}.
scatterPairs = { ...
    X1, Y,  'X1', 'Y'; ...
    X2, Y,  'X2', 'Y'; ...
    X1, X2, 'X1', 'X2'};
for pairIdx = 1 : size(scatterPairs, 1)
    figure;
    scatter(scatterPairs{pairIdx, 1}, scatterPairs{pairIdx, 2});
    xlabel(scatterPairs{pairIdx, 3});
    ylabel(scatterPairs{pairIdx, 4});
end

% Joint view of all three variables.
figure;
scatter3(X1, X2, Y);
xlabel('X1');
ylabel('X2');
zlabel('Y');

% Collect the columns as [Y, X1, X2] and rescale the whole matrix so that
% its largest absolute entry equals 1.
data = [Y, X1, X2];
peakMagnitude = max(abs(data(:)));
data = data ./ peakMagnitude;

% Export as a CSV file with named columns.
T = array2table(data, 'VariableNames', {'Y','X1','X2'});
writetable(T, 'mlr_3vars_n100.csv');



%% Overfitting

% Sweep of design-matrix widths (M) and sample sizes (N). For every
% combination, N_rnd regressions are fitted to uniform random data and
% the resulting R^2 values are averaged.
M = 2:50;
N = 2:100;
N_rnd = 1000;

% R2(row, col) is the mean R^2 for N(row) samples and M(col) columns.
R2 = zeros(numel(N), numel(M));

for col = 1 : numel(M)
    nCols = M(col);
    for row = 1 : numel(N)
        nObs = N(row);
        r2Sum = 0;
        for rep = 1 : N_rnd
            % Random response and random design matrix; the first column
            % of the design matrix is overwritten with a constant term.
            resp = rand(nObs, 1);
            respMean = mean(resp);
            design = rand(nObs, nCols);
            design(:, 1) = 1;

            % Solve the linear system with backslash and compute fitted values.
            coef = design \ resp;
            fitted = design * coef;

            % Residual and total sums of squares.
            ssRes = sum((resp - fitted).^2);
            ssTot = sum((resp - respMean).^2);

            r2Sum = r2Sum + 1 - (ssRes/ssTot);
        end
        R2(row, col) = r2Sum / N_rnd;
    end
end

%% Plot overfitting
close all;

% Heat map of the mean R^2 stored in R2, where R2(j,i) belongs to N(j)
% samples and M(i) design-matrix columns (the first column is the
% constant term set in the simulation loop).
h = figure;
h.Color = 'w';

h.Position(3:4) = [800 600];

% Pass the swept values as axis coordinates. Plotting against bare
% row/column indices (imagesc(R2)) shifts every tick by one, because both
% M and N start at 2: index 10 holds the result for a value of 11.
imagesc(M, N, R2);

ax = gca;
ax.FontSize = 18;

hh = ylabel('Sample Size ($n$)', 'interpreter', 'latex');
hh.FontSize = 27;
hh = xlabel('Number of Predictors ($k$)', 'interpreter', 'latex');
hh.FontSize = 27;


hh = title('Goodness of Fit ($R^2$)', 'interpreter', 'latex');
hh.FontSize = 31;

% Reversed 50-step YlGnBu palette from brewermap.
cmap = flipud(brewermap(50,'YlGnBu'));
colormap(cmap);
colorbar;

% Ticks every 10 units, expressed in data units (actual M and N values);
% the tick labels are generated automatically from these positions.
ax.XTick = 10:10:max(M);
ax.YTick = 10:10:max(N);
