% Code to reproduce the results and figures from "What gender gap in chess
% achievement" by Wei Ji Ma for ChessBase, Oct 7, 2020.
% To run this code, first download a rating list, convert it to CSV, save it
% as standard_rating_list_edited.csv (the file name this script reads), and
% make sure that the columns are:
% A: Federation
% B: Gender
% C: Rating
% D: Birth year
% E: Name
% Column E (Name) is used only to report the top players, so if you do not
% include it, just comment out the few lines in which it is used. Column D
% (Birth year) is needed only if you want to exclude juniors.
% Start from an empty workspace with no open figures.
clear
close all
% Graphics defaults for every figure created below: thicker lines and a
% larger axes font.
set(groot, 'DefaultLineLineWidth', 2, 'DefaultAxesFontSize', 16);
%% Thought experiment
% Two groups of sizes nA and nB each draw integer scores uniformly from
% 1..100. Repeating this ndraws times shows how the average best score of a
% group depends on the group size alone.
nA = 10;
nB = 2;
ndraws = 1e6;
% Each column is one repetition. The maximum is taken along dimension 1
% explicitly so that a group of size 1 still yields one maximum per
% repetition instead of a single scalar over all repetitions.
max_A = max(randi(100,[nA ndraws]),[],1);
max_B = max(randi(100,[nB ndraws]),[],1);
% Left unsuppressed on purpose: prints the average best score per group.
mean(max_A)
mean(max_B)
%% Read data
% Each column of the CSV is read separately (A: federation, B: gender,
% C: rating, D: birth year, E: name). With 'OutputType','string', readmatrix
% already returns a string array, so the text columns need no further
% conversion. They must not be passed through cell2mat, which expects a
% cell array as input.
data_file = 'standard_rating_list_edited.csv';
country = readmatrix(data_file,'range','A:A','OutputType','string');
gender = readmatrix(data_file,'range','B:B','OutputType','string');
rating = readmatrix(data_file,'range','C:C');
birth = readmatrix(data_file,'range','D:D');
name = readmatrix(data_file,'range','E:E','OutputType','string');
% Keep only players born before 2000 (this removes the juniors) whose
% federation code is "IND". Rows with a missing (NaN) birth year fail the
% comparison and are dropped as well.
idx = birth<2000 & strcmp(country,"IND");
gender = gender(idx);
rating = rating(idx);
name = name(idx);
% Note: country and birth are not filtered and keep their full length; they
% are not used again below.
% Splitting by gender
rating_M = rating(strcmp(gender,"M"));
rating_F = rating(strcmp(gender,"F"));
%% Basic stats
% Statements without a trailing semicolon are left unsuppressed on purpose:
% they print their value to the command window.
is_M = strcmp(gender,"M");
is_F = strcmp(gender,"F");
n = numel(rating)
n_M = sum(is_M)
n_F = sum(is_F)
pF = n_F/n * 100
max_M = round(max(rating_M))
max_F = round(max(rating_F))
% Look up the top-rated player(s) within each gender. Matching on rating
% alone would also return players of the other gender who happen to have
% the same rating (e.g. every man rated equal to the best woman).
best_M = name(is_M & rating==max_M)
best_F = name(is_F & rating==max_F)
% Gender and name of the (up to) 20 highest-rated players. Missing ratings
% are sorted last so they cannot occupy the top of the list, and the count
% is capped so that a short list does not cause an indexing error.
[~,I] = sort(rating,'descend','MissingPlacement','last');
I_top = I(1:min(20,numel(I)));
gender(I_top)
name(I_top)
mu_M = mean(rating_M)
mu_F = mean(rating_F)
std_M = round(std(rating_M))
std_F = round(std(rating_F))
% Rating histograms per gender. Bin centers are derived from the edges so
% they stay correct if the bin width is changed.
rating_edges = 1000:50:2800;
rating_centers = rating_edges(1:end-1) + diff(rating_edges)/2;
h_M = histcounts(rating_M, rating_edges);
h_F = histcounts(rating_F, rating_edges);
% Proportions relative to all players of that gender (including any whose
% rating falls outside the binned range).
hn_M = h_M/n_M;
hn_F = h_F/n_F;
%% Permutation test
% Null hypothesis: gender labels are unrelated to rating. Each draw shuffles
% the pooled ratings, splits them into a "large" group of n_M and a "small"
% group of n_F players, and records the best rating in each group.
ndraws = 100000;
n_large = n_M;
n_small = n_F;
% Pool only the ratings of players labelled M or F, so that the two groups
% together use exactly n_large + n_small ratings even if some players carry
% another or a missing gender label.
pool = rating(strcmp(gender,"M") | strcmp(gender,"F"));
n_pool = numel(pool);
max_large = NaN(1,ndraws);
max_small = NaN(1,ndraws);
for i = 1:ndraws
    rating_perm = pool(randperm(n_pool));
    draw_large = rating_perm(1:n_large);
    draw_small = rating_perm(n_large + 1:end);
    max_large(i) = max(draw_large);
    max_small(i) = max(draw_small);
end
% Left unsuppressed on purpose: prints the average best rating per group and
% the mean and standard deviation of the gap expected from group size alone.
mean(max_large)
mean(max_small)
delta = max_large - max_small;
delta_mean = mean(delta)
delta_std = std(delta)
%% Plots
% Figure 1: raw number of players per rating bin, one curve per gender.
figure;
plot(rating_centers, [h_M; h_F],'o-')
ylabel('Number of players')
xlabel('Rating (binned)')
xlim([1000 2800])
set(gca,'LineWidth',1)
grid on;
legend('M','F')
title('Rating distributions of Indian players by gender')
% Figure 2: the same histograms normalized by the size of each gender group.
figure;
plot(rating_centers, [hn_M; hn_F],'o-')
ylabel('Proportion of players')
xlabel('Rating (binned)')
grid on;
set(gca,'ytick',0:0.02:0.1)
set(gca,'LineWidth',1)
legend('M','F')
title('Normalized rating distributions of Indian players by gender')
xlim([1000 2800])
% Figure 3: permutation distribution of the gap between the best rating in
% the large group and the best rating in the small group.
figure;
% hist is called with an output so it only bins delta (same bin centers as
% before); the counts are then divided by 1000 so that the bar heights
% match the 'Frequency/1000' axis label.
delta_centers = -100:50:500;
delta_counts = hist(delta, delta_centers);
bar(delta_centers, delta_counts/1000, 1)
set(gca,'xtick',-100:100:500)
xlabel('Difference between best M and best F')
ylabel('Frequency/1000')