```function csS = som_stats(D,varargin)

%SOM_STATS Calculate descriptive statistics for the data.
%
% csS = som_stats(D,[sort]);
%
%  csS = som_stats(D);
%  csS = som_stats(D,'nosort');
%  som_table_print(som_stats_table(csS))
%
%  Input and output arguments ([]'s are optional):
%   D           (matrix) a matrix, size dlen x dim
%               (struct) data or map struct
%   [sort]      (string) 'sort' (default) or 'nosort'
%                        If 'nosort' is specified, the data is not
%                        sorted, and therefore the nunique, mfvalue,
%                        mfcount, values, counts and tiles fields
%                        are not calculated (they keep their initial
%                        NaN / [] values). This may be useful if
%                        there is a very large amount of data, and
%                        one wants to reduce calculation time.
%
%   csS         (cell array) size 1 x dim, of statistics structs with
%                        the following fields
%      .type             (string) 'som_stat'
%      .name             (string) name of the variable
%      .normalization    (struct array) variable normalization (see SOM_NORMALIZE)
%      .ntotal           (scalar) total number of values
%      .nvalid           (scalar) number of valid values (not Inf or NaN)
%      .min              (scalar) minimum of the valid values
%      .max              (scalar) maximum of the valid values
%      .mean             (scalar) mean of the valid values
%      .std              (scalar) standard deviation of the valid values
%      .nunique          (scalar) number of unique values
%      .mfvalue          (scalar) most frequent value (the smallest one
%                                 if several values are equally frequent)
%      .mfcount          (scalar) number of occurrences of the most frequent value
%      .values           (vector) sample values: all unique values if at
%                                 most half of the valid values are unique,
%                                 otherwise at most MAXDISCRETE unique
%                                 values picked evenly from the sorted
%                                 unique values
%      .counts           (vector) number of occurrences of each sampled value
%      .tiles            (vector) NT-tile values, for example
%                                    NT=4   for quartiles: 25%, 50% and 75%
%                                    NT=100 for percentiles: 1%, 2%, ... and 99%
%      .hist             (struct) histogram struct with the following fields
%           .type        (string) 'som_hist'
%           .bins        (vector) histogram bin centers
%           .counts      (vector) count of values in each bin
%           .binlabels   (cellstr) labels for the bins (denormalized bin
%                                  center values)
%           .binlabels2  (cellstr) labels for the bins (denormalized bin
%                                  edge values, e.g. '[1.4,2.5['
%
%   Constants:
%      MAXDISCRETE = 20
%      NT          = 10
%      NBINS       = 10  (histogram bins used when .values does not
%                         hold all unique values)
%

% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 311201

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% arguments

% default values
nosort      = 0;
nbins       = 10;
maxdiscrete = 20;
ntiles      = 10;

% first argument: a map/data struct, or a plain matrix
if isstruct(D),
  % a struct without a 'type' field is reported the same way as an
  % unknown type, instead of failing on the field access
  if ~isfield(D,'type'), error('Invalid first argument'); end
  switch D.type,
   case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook;
   case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data;
   otherwise, error('Invalid first argument')
  end
else
  % plain matrix: generate names Variable1, Variable2, ... and use
  % empty normalizations
  cn = cell(size(D,2),1);
  for i=1:length(cn), cn{i} = sprintf('Variable%d',i); end
  sN = cell(size(D,2),1);
end
[dlen, dim] = size(D);

% other arguments: only 'nosort' has an effect
if ~isempty(varargin),
  if strcmp(varargin{1},'nosort'), nosort = 1; end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% template struct; every variable starts from a copy of this
sStat = struct('type','som_stat','name','','normalization',[],...
               'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
               'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
               'ntotal',dlen,'nvalid',NaN,'hist',[]);
csS = cell(1,dim);

for i=1:dim,
  sS = sStat;
  sS.name = cn{i};
  sS.normalization = sN{i};

  % keep only the finite values (drops NaN, Inf and -Inf)
  x = D(:,i);
  x = x(isfinite(x));

  % basic descriptive statistics
  sS.nvalid = length(x);
  if ~isempty(x),
    sS.min  = min(x);
    sS.max  = max(x);
    sS.mean = mean(x);
    sS.std  = std(x);
    bins = [];
    if ~nosort,
      xsorted    = sort(x(:));
      % number of unique values: first index of each run of equal values
      repeated   = (xsorted(1:end-1)==xsorted(2:end));
      j          = [1; find(repeated(:)==0)+1];
      xunique    = xsorted(j);
      sS.nunique = length(xunique);
      % run lengths = occurrence count of each unique value
      ucount     = diff([j; length(xsorted)+1]);
      % most frequent value (max returns the first maximum)
      [fcount,jmax] = max(ucount);
      sS.mfvalue = xunique(jmax);
      sS.mfcount = fcount;
      % -tiles (k*100/ntiles % of values, k=1..ntiles-1)
      pickind    = round(linspace(1,sS.nvalid,ntiles+1));
      pickind    = pickind(2:end-1);
      sS.tiles   = xsorted(pickind);
      if sS.nunique <= sS.nvalid/2,
        % few distinct values: keep all of them, and use them as the
        % histogram bin centers
        sS.values = xunique;
        sS.counts = ucount;
        bins = sS.values;
      else
        % at most maxdiscrete values, evenly picked; unique() removes
        % the repeated indices that rounding produces when there are
        % fewer than maxdiscrete unique values
        pickind    = unique(round(linspace(1,sS.nunique,maxdiscrete)));
        sS.values  = xunique(pickind);
        sS.counts  = ucount(pickind);

        %% OPTION 2: maxdiscrete most frequent values
        %[v,j]     = sort(ucount);
        %pickind   = j(1:maxdiscrete);
        %sS.values = xunique(pickind);
        %sS.counts = ucount(pickind);

        % OPTION 3: representative values - calculated using k-means
        %[y,bm,qe] = kmeans(x,maxdiscrete);
        %sS.values = y;
        %sS.counts = full(sum(sparse(bm,1:length(bm),1,maxdiscrete,length(bm)),2));
      end
    end
    if isempty(bins),
      % nbins equally wide bins over [min,max]; bins holds the centers
      bins = linspace(sS.min,sS.max,nbins+1);
      bins = (bins(1:end-1)+bins(2:end))/2;
    end
    sS.hist = som_hist(x,bins,sS.normalization);
  else
    % no valid values: single-bin histogram at 0 with a zero count
    sS.hist = som_hist(x,0);
  end
  csS{i} = sS;
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
%% subfunctions

function sH = som_hist(x,bins,sN)
%SOM_HIST Build a histogram struct of type 'som_hist' for the values in x.
%
%  x     (vector) data values
%  bins  (vector) histogram bin centers; a single value gives one bin
%                 holding all values of x. If missing, empty or all
%                 NaN, 10 evenly spaced centers between min(x) and
%                 max(x) are used (a single bin at 0 if x is empty).
%        (struct) an existing histogram struct; its .bins, .binlabels
%                 and .binlabels2 fields are reused
%  sN    (struct array) normalization handed to SOM_DENORMALIZE when
%                 the bin labels are generated, [] by default
%
%  sH    (struct) fields .type, .bins, .counts, .binlabels, .binlabels2

if nargin<3, sN = []; end
if nargin<2, bins = []; end

binlabels  = [];
binlabels2 = [];
% The struct case is handled first: isnan() cannot be applied to a
% struct, and the fields must be read from the argument itself.
if isstruct(bins),
  sOld       = bins;
  bins       = sOld.bins;
  binlabels  = sOld.binlabels;
  binlabels2 = sOld.binlabels2;
end
if isempty(bins) || all(isnan(bins(:))),
  if isempty(x),
    bins = 0;
  else
    bins = linspace(min(x),max(x),10);
  end
end

% Fields are assigned after creation: passing a cell array (such as
% reused bin labels) directly to struct() would create a struct array.
sH = struct('type','som_hist','bins',[],'counts',[],...
            'binlabels',[],'binlabels2',[]);
sH.bins       = bins;
sH.binlabels  = binlabels;
sH.binlabels2 = binlabels2;

if length(bins)==1,
  % a single bin collects everything
  sH.counts = length(x);
  edges = bins;
else
  % bin edges lie halfway between neighbouring bin centers; the
  % outermost bins are open-ended
  edges = (bins(1:end-1)+bins(2:end))/2;
  if ~isempty(x),
    counts = histc(x,[-Inf; edges(:); Inf]);
    % the last histc entry only counts values exactly equal to Inf
    sH.counts = counts(1:end-1);
  else
    sH.counts = zeros(length(bins),1);
  end
end

% labels from the bin centers
if isempty(sH.binlabels),
  b = som_denormalize(bins(:),sN);
  sH.binlabels = numtostring(b,4);
end

% labels from the bin edges
if isempty(sH.binlabels2),
  if length(edges)==1,
    sH.binlabels2 = numtostring(som_denormalize(edges,sN),2);
    if length(bins)>1,
      % two bins separated by a single edge
      sH.binlabels2 = sH.binlabels2([1 1]);
      sH.binlabels2{1} = [']' sH.binlabels2{1} '['];
      sH.binlabels2{2} = ['[' sH.binlabels2{2} '['];
    end
  else
    edges = edges(:);
    bstr = numtostring(som_denormalize(edges,sN),4);
    % one label more than there are edges: first and last are open-ended
    sH.binlabels2 = bstr([1:end end]);
    sH.binlabels2{1} = [bstr{1} '['];
    for i=2:length(sH.binlabels2)-1,
      sH.binlabels2{i} = ['[' bstr{i-1} ',' bstr{i} '['];
    end
    sH.binlabels2{end} = ['[' bstr{end}];
  end
end

return;

function vstr = numtostring(v,d)
%NUMTOSTRING Convert a numeric vector to a cell array of strings.
%
%  v     (vector) values to convert
%  d     (scalar) precision passed to NUM2STR; also sets the near-zero
%                 threshold 10^-d (relative to the range of v)
%
%  vstr  (cellstr) one string per value, with all spaces removed.
%                 Non-zero values whose magnitude relative to the range
%                 of v is below 10^-d are shown as '0.0' or '-0.0'.

% Work on a column: for a row vector num2str would produce a single
% row, which the space removal below would fuse into one string.
v = v(:);

r = max(v)-min(v);
if r==0, r=1; end   % constant input: avoid dividing by zero
nearzero = (abs(v)/r < 10.^-d);
ipos = find(v > 0 & nearzero);
ineg = find(v < 0 & nearzero);
vstr = strrep(cellstr(num2str(v,d)),' ','');
vstr(ipos) = {'0.0'};
vstr(ineg) = {'-0.0'};
return;

```