Foto del docente

Matteo Farnè

Ricercatore a tempo determinato tipo b) (senior)

Dipartimento di Scienze Statistiche "Paolo Fortunati"

Settore scientifico disciplinare: SECS-S/01 STATISTICA

Contenuti utili

trimmed factorial k-means with automatic selection of alpha

% tfkm carries out trimmed factorial k-means algorithm, introduced in
%
% Farnè, M. and Vouldis, A. T. (2017), 'Business models of the Banks in the Euro Area',
% No. 2070. ECB Working Paper.
%
% This method is a robustified version of the factorial k-means algorithm developed
% in Vichi, M., & Kiers, H. A. (2001). Factorial k-means analysis for two-way data.
% Computational Statistics & Data Analysis, 37(1), 49-64.
%
% The INPUT arguments are:
% z: n (objects) times p (variables) data matrix .
% alpha_all: vector of allowed proportions of outliers. alpha=0 means that the original
% factorial k-means is applied.
% r_all: vector of tested number of factors.
% g_all: vector of te numbers of clusters.
% cov_yes=1 if the input is the covariance matrix, 0 if the input is the
% correlation matrix.
% r_G: 0 for a rational initialization of loadings, memberships and
% centroids (default), 1 for a random initialization (see Farnè, M. and
% Vouldis, A. T. (2017) for the details).
% N: number of replicates (defaults to 100).
%
% The OUTPUT arguments are:
% fac_scores: the matrix of factor scores for each object.
% Member_mod: the vector of group membership (outliers=0).
% group_num: the number of observations in each cluster (excluding
% outliers).
% out_tfkm: outlier identifiers.
% n_out_tfkm: number of identified outliers.
% Member: the vector of group membership (without excluding outliers)
% group_comp: the number of observations in each cluster (without excluding outliers)
% Hartigan: the matrix of Hartigan statistics for each rank-number of
% groups combination.
% i1: optimal rank according to the maximization of Hartigan statistics.
% i2: optimal number of groups according to the maximization of Hartigan
% statistics.
% model_select: 1 if rank and n_groups are selected maximizing Hartigan statistics, 0 otherwise.
% Available only if g_all has length larger than one. In case its length is one while
% the length of r_all is larger than 1, the first rank of r_all is automatically selected.
% Please note that the length of g_all must be such that Hartigan
% statistics may be computed for all pair (r,n_groups) with r<=n_groups-1.
% If we want to consider such a pair (r,n_groups), n_groups+1 must be
% contained in g_all. If no pair satisfies this condition, no comparison is
% available for alpha, and the first value of alpha_all is automatically
% selected.

function[output_all]=tfkm(z,alpha_all,r_all,g_all,cov_yes,r_G,N)

if~exist('r_G','var')
r_G=0;
end

if~exist('N','var')
N=100;
end


if length(alpha_all)==1
%if length(alpha_all)==0
output_all=tfkm(z,alpha_all,r_all,g_all,cov_yes,r_G,N)
return;

else

for out_prop=1:length(alpha_all)

output_all=tfkm(z,alpha_all(out_prop),r_all,g_all,cov_yes)

if iscell(output_all)==0
return;
else
if output_all{1,15}==0
return;
else
Hartigan_all(out_prop)=output_all{1,15}(output_all{1,17},output_all{1,19});
%=Hartigan(output_all{1,17},output_all{1,19});
rank_all(out_prop)=output_all{1,17};
cluster_all(out_prop)=output_all{1,19};
end
end
end
end

[min2 ii1]=max(Hartigan_all)
i1=rank_all(ii1)
i2=cluster_all(ii1)
alpha=alpha_all(ii1);

r_yes=i1;
g_yes=i2;

Hartigan=min2;

if alpha==0%%~exist('robust','var') &&
robust=0;
end

if ne(alpha,0)==1%%~exist('robust','var') &&
robust=1;
end

%%
n=size(z,1);%%number of objects
p=size(z,2);%%number of variables
if cov_yes==1
C=cov(z)%%covariance matrix
else
C=corr(z);
end%%correlation matrix
%r_length=length(r_all);
%g_length=length(g_all);
%%

% OPTIMAL SOLUTION IMPLEMENTATION
% Initialize rank
[U_r, D_r]=svds(C,i1);
R=zeros(p,p);
V=zeros(p,p);
E=zeros(p,p);
v=zeros(p,1);
Random=zeros(p,p,N);
r=i1;
p=length(C);

clear Random E V
for number=1:N
K=rand(p)*eye(p);
rank(K);
% Here a Gram-Schmidt algorithm is used to derive the orthogonal base of the space defined by K
E(:,1)=K(:,1);

for j=2:p
for i=1:(j-1)
R(i,j)=dot(K(:,j),E(:,i))/(norm(E(:,i))^2);
end
for i=1:(j-1)
V(:,i)=R(i,j)*E(:,i);
end

for h=1:p
v(h,1)=sum(V(h,1:(j-1)));
end
E(:,j)=K(:,j)-v;
end

rank(E);
E'*E;

for i=1:p
E(:,i)=E(:,i)/norm(E(:,i));
end
rank(E);
E;
E'*E;

Random(:,:,number)=E*U_r;
end

% Initialize n_groups
n_groups=i2;
group=1:n_groups;
RawData=z;
n_Raw=size(RawData,1);
%r_G=0;
clear G Random_G centr maxRaw minRaw rangeRaw dist_G t_hotelling_pre F_pre pace

% The clustering initialisation loop
for number=1:N

for i=1:p
maxRaw(i)=max(RawData(:,i));
minRaw(i)=min(RawData(:,i));
rangeRaw(i)=maxRaw(i)-minRaw(:,i);
end

vvar=var(z');
mmean=mean(z');

if r_G==1
for j=1:n_Raw
casual(j)=randi(n_groups);
G(j,casual(j))=1;
end
end

if r_G==0
fac_pre=z*Random(:,:,number);
for j=1:n_Raw
dist_G(:,j)=(fac_pre(j,:)-mean(fac_pre))';
t_hotelling_pre(:,j)=n_Raw*dist_G(:,j)'*inv(cov(fac_pre))*dist_G(:,j);
end

q_G=quantile(t_hotelling_pre,2*n_groups-1);

% The distances of objects from the various centroids is calculated
for j=1:n_Raw
for m=1:n_groups
F_pre(j,m)=norm(t_hotelling_pre(j)-q_G(2*m-1));
end
end

% Here the assignment of objects to clusters is done, based on the t-hotelling variance measure

for j=1:n_Raw
for m=1:n_groups
if min(F_pre(j,:))==F_pre(j,m)
G(j,m)=1;
else
G(j,m)=0;
end
end
end
end

Random_G(:,:,number)=G;

clear G
end

clear YY GG UU_GG F IQR Member
n_it=0;
%robust=1;

for number=1:N
G=Random_G(:,:,number);
U_G=Random(:,:,number);
Y_bar=inv(G'*G)*G'*RawData*U_G;
ob_pre=norm(RawData*U_G-G*Y_bar)^2;
diff=ob_pre;
n_it=0;

% The main loop of clustering starts. It runs while diff>0
while ((diff))>0

for j=1:n

for m=1:n_groups
count=zeros(1,n_groups);
count(m)=1;
F(j,m)=norm(z(j,:)*U_G-count*Y_bar)^2;
end

% each object is assigned to clusters according to the minimum distance
for m=1:n_groups
if min(F(j,:))==F(j,m)
G(j,m)=1;
else
G(j,m)=0;
end
end

end

clear Member
for j=1:n
for m=1:n_groups
if G(j,m)==1
Member(j)=m;
end
end
end

% Performs Step 1 in (Vichi and Kiers, 2001, p. 57), so in case the loop ends,
% the centroids and groupings are updated

centroids=Y_bar;
fac_scores=(z)*U_G;
cen_true=G*centroids;

if robust==1
for m=1:n_groups
for j=1:n
diff_scores(:,j)=fac_scores(j,:)'-cen_true(Member(j),:)';
t_hotelling(j)=n*diff_scores(:,j)'*inv(cov(fac_scores))*diff_scores(:,j);
t_test(j)=(n-r)/(r*(n-1))*t_hotelling(j);
end
end

thr_clust_max=quantile(t_test,1-alpha);

clear flag

boundmax=thr_clust_max;

% The objects exceeding the threshold, are flagged as outliers
for j=1:n
if t_test(j)>boundmax
flag(j)=1;
else flag(j)=0;
end
end

% outliers are removed from the groups
for j=1:n
if flag(j)==1
G(j,:)=zeros(1,n_groups);
end
end
end

if rank(G)==size(G,2)
[U_G, D_G]=svds(z'*(G*inv(G'*G)*G'-eye(n))*z,i1);
% extraction of eigenvectors in Step 2
% see ten Berge, J. M. (1993). Least squares optimization in multivariate
% analysis. Leiden, The Netherlands: DSWO Press, Leiden University.

Y_bar=inv(G'*G)*G'*z*U_G;
if n_it>1
ob_pre=ob_post;
end
ob_post= norm(z*U_G-G*Y_bar)^2;
diff=ob_post-ob_pre;
diff_pre=diff;
n_it=n_it+1;
if diff<0
YY(:,:,number)=Y_bar;
GG(:,:,number)=G;
UU_GG(:,:,number)=U_G;
clear obob
obob(number)=ob_post;
diffdiff(number)=diff;
N_it(number)=n_it;
end
else diff=-1;
end
end
if diff==0
obob(number)=0;
N_it(number)=0;
end

end

% We choose the solution among the set (=N) of initial starts.
% The solution with the minimum loss function is chosen.


if sum(N_it)~=0
gg=find(obob==min(obob(N_it~=0)));
find(obob==min(obob(N_it~=0)));
end

n_it=0;
%robust=1;

for number=gg(1):gg(1)
G=Random_G(:,:,number);
U_G=Random(:,:,number);
Y_bar=inv(G'*G)*G'*z*U_G;
ob_pre=norm(z*U_G-G*Y_bar)^2;
diff=ob_pre;
n_it=0;
while ((diff))>0
for j=1:n
for m=1:n_groups
count=zeros(1,n_groups);
count(m)=1;
F(j,m)=norm(z(j,:)*U_G-count*Y_bar)^2;
end

for m=1:n_groups
if min(F(j,:))==F(j,m)
G(j,m)=1;
else
G(j,m)=0;
end
end

end

clear Member
for j=1:n
for m=1:n_groups
if G(j,m)==1
Member(j)=m;
end
end
end

centroids=Y_bar;
fac_scores=(z)*U_G;
cen_true=G*centroids;

if robust==1
clear diff_scores t_hotelling
for m=1:n_groups
for j=1:n
diff_scores(:,j)=fac_scores(j,:)'-cen_true(Member(j),:)';
t_hotelling(j)=n*diff_scores(:,j)'*inv(cov(fac_scores))*diff_scores(:,j);
t_test(j)=(n-r)/(r*(n-1))*t_hotelling(j);
end
end

IQR=quantile(t_test,0.75)-quantile(t_test,0.25);
thr_clust_max=quantile(t_test,1-alpha);

clear flag
boundmax=thr_clust_max;
for j=1:n
if t_test(j)>boundmax
flag(j)=1;
else flag(j)=0;
end
end

for j=1:n
if flag(j)==1
G(j,:)=zeros(1,n_groups);
end
end
end

if rank(G)==size(G,2)
[U_G, D_G]=svds(z'*(G*inv(G'*G)*G'-eye(n))*z,i1);
Y_bar=inv(G'*G)*G'*z*U_G;
if n_it>1
ob_pre=ob_post;
end
ob_post= norm(z*U_G-G*Y_bar)^2;
diff=ob_post-ob_pre;
diff_pre=diff;
n_it=n_it+1;
if diff<0
YY(:,:,number)=Y_bar;
GG(:,:,number)=G;
UU_GG(:,:,number)=U_G;
obob(number)=ob_post;
diffdiff(number)=diff;
end
else diff=-1;
end
end
end

%%OUTPUTS

if robust==1
Member;
Member_mod=Member;
for j=1:n
if flag(j)==1
Member_mod(j)=0;
end
end
Member_mod;

out_tfkm=find(Member_mod==0);
n_out_tfkm=length(out_tfkm);

clear group_num % gives number of members of each group, excluding outliers!
for m=1:n_groups
group_num(m)=sum(flag(Member_mod==m));
end
group_num;
end

% here the composition also including outliers
clear group_comp
for m=1:n_groups
group_comp(m)=length(find(Member==m));
end
sum(group_comp);
group_comp;

if robust==0
out_tfkm=0;
n_out_tfkm=[];
group_num=group_comp;
Member_mod=Member;
end

model_select=1;

output_all={fac_scores,'fac_scores',Member_mod,'Member_mod',group_num,'group_num',out_tfkm,'out_tfkm',n_out_tfkm,'n_out_tfkm',Member,'Member',group_comp,'group_comp',Hartigan,'Hartigan',r_yes,'rank',g_yes,'cluster_number',model_select,'model_select',alpha,'alpha'}


end