% Code to reproduce the results and figures from "What gender gap in chess
% achievement" by Wei Ji Ma for ChessBase, Oct 7, 2020.
% To run this code, first download a rating list, convert it to CSV, save
% it as 'standard_rating_list_edited.csv' where MATLAB can find it, and make
% sure that the columns are:
% A: Federation
% B: Gender
% C: Rating
% D: Birth year
% E: Name
% Column E (Name) is barely used so if you don't include that, just comment
% out the few lines in which it is used. Column D is not needed if you
% don't want to exclude juniors.

% Start from an empty workspace with no open figures.
clear;
close all;

% Session-wide plot defaults (set on the graphics root): thicker lines and
% a larger axes font for every figure created below.
set(groot, 'DefaultLineLineWidth',2, 'DefaultAxesFontSize',16);

%% Thought experiment
% Group A has nA members and group B has nB members. Every member receives
% an independent integer score drawn uniformly from 1..100. The experiment
% is repeated ndraws times (one column per repetition) and the best score
% of each group is recorded, so the two printed averages show how the
% expected maximum depends on group size alone.

ndraws = 1e6;
nA = 10;
nB = 2;

% Column-wise maximum: one best score per repetition.
max_A = max(randi(100, nA, ndraws), [], 1);
max_B = max(randi(100, nB, ndraws), [], 1);

% Left unsuppressed on purpose so the averages are printed.
mean(max_A)
mean(max_B)

%% Read data
% Each column of the CSV is read separately; see the header comment at the
% top of the file for the expected column layout.

datafile = 'standard_rating_list_edited.csv';

% readmatrix with 'OutputType','string' already yields a string array, so
% no cell2mat round-trip is needed: cell2mat is documented for cell-array
% input and would additionally require all entries to have equal width.
% string() is kept as a harmless normalisation (a no-op on string input).
country = string(readmatrix(datafile,'range','A:A','OutputType','string'));
gender  = string(readmatrix(datafile,'range','B:B','OutputType','string'));

% Numeric columns.
rating  = readmatrix(datafile,'range','C:C');
birth   = readmatrix(datafile,'range','D:D');

% Player names (only used for display further down).
name    = readmatrix(datafile,'range','E:E','OutputType','string');

% Restrict the sample to players born before 2000 (this removes the junior
% players) who are registered under the federation code "IND".
born_before_2000 = birth < 2000;
from_india       = strcmp(country,"IND");
idx = born_before_2000 & from_india;

% Apply the same mask to every per-player array that is used below.
name    = name(idx);
rating  = rating(idx);
gender  = gender(idx);

% Ratings of the remaining players, separated by the gender column.
male_mask   = strcmp(gender,"M");
female_mask = strcmp(gender,"F");
rating_M = rating(male_mask);
rating_F = rating(female_mask);

%% Basic stats
% Statements without a trailing semicolon are intentional: the script
% prints each statistic to the command window.

% Sample sizes and the percentage of female players.
is_M = strcmp(gender,"M");
is_F = strcmp(gender,"F");
n = length(rating)
n_M = sum(is_M)
n_F = sum(is_F)
pF = n_F/n * 100

% Highest rating within each gender (rounded for display).
max_M = round(max(rating_M))
max_F = round(max(rating_F))

% Name(s) of the top-rated player(s) of each gender. The gender mask is
% required: matching on the rating alone would also return players of the
% other gender who happen to have the same rating. The comparison uses the
% unrounded maximum so that it still matches if ratings are not integers.
best_M = name(is_M & rating==max(rating_M))
best_F = name(is_F & rating==max(rating_F))

% Gender and name of the (up to) 20 highest-rated players overall. The
% count is capped at n so that small samples do not cause an indexing error.
[~,I] = sort(rating,'descend');
I_top = I(1:min(20,n));
gender(I_top)
name(I_top)

% Mean rating per gender.
mu_M   = mean(rating_M)
mu_F   = mean(rating_F)

% Standard deviation of the ratings per gender (rounded).
std_M = round(std(rating_M))
std_F = round(std(rating_F))

% Histogram of ratings per gender on a common grid of 50-point bins
% spanning 1000 to 2800.
bin_width = 50;
rating_edges = 1000:bin_width:2800;
% Midpoint of every bin, used as the x coordinate when plotting.
rating_centers = rating_edges(1:end-1) + bin_width/2;

% Raw counts per bin.
h_M = histcounts(rating_M, rating_edges);
h_F = histcounts(rating_F, rating_edges);

% Counts expressed as a fraction of all players of that gender.
hn_M = h_M ./ n_M;
hn_F = h_F ./ n_F;


%% Permutation test
% Shuffles the pooled ratings ndraws times. On every shuffle the pool is
% split into a group of n_large and a group of n_small ratings (the sizes
% of the M and F groups) and the top rating of each random group is stored.
% The spread of the differences shows how large a gap between the two top
% ratings arises from group size alone.

ndraws = 100000;
n_large = n_M;
n_small = n_F;

% Pool only the ratings that belong to one of the two groups. Using the
% full rating vector would let rows whose gender is neither "M" nor "F"
% enter the pool and silently enlarge the small group.
rating_pool = [rating_M(:); rating_F(:)];
n_pool = n_large + n_small;

max_large = NaN(1,ndraws);
max_small = NaN(1,ndraws);

for i = 1:ndraws
    rating_perm = rating_pool(randperm(n_pool));
    draw_large = rating_perm(1:n_large);
    % Exactly n_small elements, so the group sizes match the data.
    draw_small = rating_perm(n_large + 1:n_large + n_small);

    max_large(i) = max(draw_large);
    max_small(i) = max(draw_small);
end

% Average top rating of each random group (printed).
mean(max_large)
mean(max_small)

% Difference between the two top ratings across shuffles (printed).
delta = max_large - max_small;
delta_mean = mean(delta)
delta_std = std(delta)


%% Plots

% Figure 1: number of players in each rating bin, one curve per gender.
figure;
plot(rating_centers, h_M, 'o-', rating_centers, h_F, 'o-')
legend('M','F')
title('Rating distributions of Indian players by gender')
xlabel('Rating (binned)')
ylabel('Number of players')
xlim([1000 2800])
grid on;
% Thinner axes box than the session-wide line default.
ax = gca;
ax.LineWidth = 1;

% Figure 2: the same bins, but each curve is divided by the number of
% players of that gender so the two shapes can be compared directly.
figure;
plot(rating_centers, hn_M, 'o-', rating_centers, hn_F, 'o-')
legend('M','F')
title('Normalized rating distributions of Indian players by gender')
xlabel('Rating (binned)')
ylabel('Proportion of players')
xlim([1000 2800])
grid on;
% Fixed y ticks every 0.02 up to 0.1, and a thinner axes box.
ax = gca;
ax.YTick = 0:0.02:0.1;
ax.LineWidth = 1;

% Figure 3: distribution of the difference between the top ratings of the
% two random groups from the permutation test, in 50-point bins centred on
% -100:50:500 (hist assigns values beyond the outer centres to the end bins).
figure;
delta_centers = -100:50:500;
delta_counts = hist(delta, delta_centers);
% The y label promises counts in thousands, so scale the counts to match;
% plotting the raw counts would contradict the label. Bar width 1 keeps the
% bars touching, as in a histogram.
bar(delta_centers, delta_counts/1000, 1)
set(gca,'xtick',-100:100:500)
xlabel('Difference between best M and best F')
ylabel('Frequency/1000')