somtoolbox2/som_stats.m
4dbef185
 function csS = som_stats(D,varargin)
 
 %SOM_STATS Calculate descriptive statistics for the data.
 %  
 % csS = som_stats(D,[sort]); 
 % 
 %  csS = som_stats(D); 
 %  csS = som_stats(D,'nosort'); 
 %  som_table_print(som_stats_table(csS))
 %
 %  Input and output arguments ([]'s are optional): 
 %   D           (matrix) a matrix, size dlen x dim
 %               (struct) data or map struct
 %   [sort]      (string) 'sort' (default) or 'nosort'
 %                        If 'nosort' is specified, the data is not 
 %                        sorted, and therefore the nunique, mfvalue, 
 %                        mfcount, values, counts and tiles fields 
 %                        are not calculated. This may be useful if
 %                        there is a very large amount of data, and
 %                        one wants to reduce calculation time.
 %
 %   csS         (cell array) size 1 x dim, of statistics structs with 
 %                        the following fields
 %      .type             (string) 'som_stat'
 %      .name             (string) name of the variable
 %      .normalization    (struct array) variable normalization (see SOM_NORMALIZE)
 %      .ntotal           (scalar) total number of values
 %      .nvalid           (scalar) number of valid values (not Inf or NaN)
 %      .min              (scalar) minimum value 
 %      .max              (scalar) maximum value 
 %      .mean             (scalar) mean value (not Inf or NaN)
 %      .std              (scalar) standard deviation (not Inf or NaN)
 %      .nunique          (scalar) number of unique values
 %      .mfvalue          (scalar) most frequent value
 %      .mfcount          (scalar) number of occurrences of the most frequent value
 %      .values           (vector) all unique values if at most half of the
 %                                 valid values are unique; otherwise at most 
 %                                 MAXDISCRETE (see below) unique values picked 
 %                                 evenly from the sorted unique values
 %      .counts           (vector) number of occurrences of each value in .values
 %      .tiles            (vector) NT-tile values, for example
 %                                    NT=4   for quartiles: 25%, 50% and 75%
 %                                    NT=100 for percentiles: 1%, 2%, ... and 99%
 %      .hist             (struct) histogram struct with the following fields
 %           .type        (string) 'som_hist'
 %           .bins        (vector) histogram bin centers 
 %           .counts      (vector) count of values in each bin
 %           .binlabels   (cellstr) labels for the bins (denormalized bin
 %                                  center values)
 %           .binlabels2  (cellstr) labels for the bins (denormalized bin
 %                                  edge values, e.g. '[1.4,2.5['
 %
 %   The histogram bins are the unique values when at most half of the
 %   valid values are unique. Otherwise (and always with 'nosort') NBINS
 %   equal-width bins between min and max are used, or a single bin if
 %   the variable is constant.
 %
 %   Constants: 
 %      MAXDISCRETE = 20
 %      NT          = 10
 %      NBINS       = 10
 %
 % See also  SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT.
 
 % Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
 % Copyright (c) by Juha Vesanto
 % http://www.cis.hut.fi/projects/somtoolbox/
 
 % Version 2.0beta juuso 311201
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %% arguments
 
 % default values
 nosort      = 0; 
 nbins       = 10;   % NBINS: number of equal-width histogram bins
 maxdiscrete = 20;   % MAXDISCRETE: max number of sampled unique values
 ntiles      = 10;   % NT: number of tiles
 
 % first argument: a data/map struct or a plain matrix
 if isstruct(D), 
     if ~isfield(D,'type'), error('Invalid first argument'); end
     switch D.type, 
     case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook; 
     case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data; 
     otherwise, error('Invalid first argument')
     end    
 else
     % plain matrix: generate names 'Variable1', 'Variable2', ... and
     % use empty normalizations
     ncols = size(D,2);
     cn = cell(ncols,1); 
     for i=1:ncols, cn{i} = sprintf('Variable%d',i); end    
     sN = cell(ncols,1); 
 end
 [dlen, dim] = size(D);
 
 % other arguments
 
 if ~isempty(varargin) && ischar(varargin{1}) && strcmp(varargin{1},'nosort'), 
     nosort = 1; 
 end
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %% action
 
 % template struct; fields which cannot be calculated stay NaN / []
 sStat = struct('type','som_stat','name','','normalization',[],...
                'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
                'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
                'ntotal',dlen,'nvalid',NaN,'hist',[]);
 csS = cell(1,dim);
            
 for i=1:dim, 
     sS = sStat;
     sS.name = cn{i};
     sS.normalization = sN{i}; 
     x = D(:,i); 
     x(~isfinite(x)) = [];   % drop NaN and Inf values
     % basic descriptive statistics
     sS.nvalid = length(x);
     if sS.nvalid > 0, 
         sS.min  = min(x);
         sS.max  = max(x);
         sS.mean = mean(x);  
         sS.std  = std(x);
         bins = [];
         if ~nosort, 
             xsorted    = sort(x);
             % number of unique values: j indexes the first element of 
             % each run of equal values in the sorted data
             repeated   = (xsorted(1:end-1)==xsorted(2:end));
             j          = [1; find(~repeated)+1];         
             xunique    = xsorted(j); 
             sS.nunique = length(xunique);           
             ucount     = diff([j; length(xsorted)+1]);   % run lengths
             % most frequent value (first one in sorted order on ties)
             [fcount,jmax] = max(ucount);
             sS.mfvalue = xunique(jmax);
             sS.mfcount = fcount;
             % -tiles (k*100/ntiles % of values, k=1..ntiles-1)
             pickind    = round(linspace(1,sS.nvalid,ntiles+1)); 
             pickind    = pickind(2:end-1);
             sS.tiles   = xsorted(pickind);
             if sS.nunique <= sS.nvalid/2, 
                 % few unique values: keep all of them, and use them 
                 % as the histogram bins, too
                 sS.values = xunique; 
                 sS.counts = ucount; 
                 bins = sS.values; 
             else
                 % at most maxdiscrete unique values, evenly picked; 
                 % limiting the sample size to nunique ensures that no 
                 % unique value is picked more than once
                 nsample    = min(maxdiscrete,sS.nunique);
                 pickind    = round(linspace(1,sS.nunique,nsample));
                 sS.values  = xunique(pickind);
                 sS.counts  = ucount(pickind);
                 % (alternatives considered by the original author: the 
                 % maxdiscrete most frequent values, or representative 
                 % values calculated with k-means)
             end 
         end 
         if isempty(bins), 
             if sS.min == sS.max, 
                 % constant variable: a single bin instead of nbins 
                 % identical zero-width bins
                 bins = sS.min; 
             else
                 % centers of nbins equal-width bins between min and max
                 bins = linspace(sS.min,sS.max,nbins+1); 
                 bins = (bins(1:end-1)+bins(2:end))/2; 
             end
         end
         sS.hist = som_hist(x,bins,sS.normalization);    
     else
         % no valid values: a single empty bin
         sS.hist = som_hist(x,0);
     end
     csS{i} = sS; 
 end
 
 return;
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
 %% subfunctions
 
 function sH = som_hist(x,bins,sN)
 
     %SOM_HIST Build a histogram struct for the values in x.
     %
     %  x      (vector) the values to be counted
     %  [bins] (vector) bin centers; if omitted, empty or all NaN, 10 
     %                  evenly spaced centers between min(x) and max(x) 
     %                  are used (a single bin at 0 if x is empty)
     %         (struct) a histogram struct, whose bins and bin labels 
     %                  are reused
     %  [sN]   (struct array) normalization passed to SOM_DENORMALIZE 
     %                  when the bin labels are made, default = []
     %
     %  sH     (struct) fields .type ('som_hist'), .bins, .counts, 
     %                  .binlabels and .binlabels2
     %
     % A value belongs to the bin between the two surrounding bin edges,
     % where the edges are the midpoints between adjacent bin centers. 
     % The first and the last bin are open-ended.
 
     binlabels  = []; 
     binlabels2 = []; 
     if nargin<3, sN = []; end
 
     % an existing histogram struct: take the bins and labels from it
     % (this has to be done before any numeric tests on bins)
     if nargin>=2 && isstruct(bins), 
         sHold      = bins; 
         bins       = sHold.bins; 
         binlabels  = sHold.binlabels;
         binlabels2 = sHold.binlabels2;
     end 
 
     % default bins
     if nargin<2 || isempty(bins) || all(isnan(bins(:))), 
         if isempty(x), bins = 0; 
         else bins = linspace(min(x),max(x),10); 
         end
     end
 
     % the labels are assigned after the struct has been created, because
     % giving a cell array as a field value to STRUCT makes a struct array
     sH = struct('type','som_hist','bins',bins,'counts',[],...
                 'binlabels',[],'binlabels2',[]);                         
     sH.binlabels  = binlabels; 
     sH.binlabels2 = binlabels2; 
             
     if length(bins)==1,
         % a single bin contains everything
         sH.counts = length(x);
         edges = bins;
     else
         % bin edges are midway between adjacent bin centers
         edges = (bins(1:end-1)+bins(2:end))/2;
         if isempty(x), 
             sH.counts = zeros(length(bins),1); 
         else
             counts = histc(x,[-Inf; edges(:); Inf]);
             % the last element of histc output counts only the values 
             % equal to the last edge (Inf), so it is dropped
             sH.counts = counts(1:end-1);       
         end
     end 
 
     % labels from bin centers
     if isempty(sH.binlabels),
         b = som_denormalize(bins(:),sN); 
         sH.binlabels = numtostring(b,4);
     end 
 
     % labels from bin edges
     if isempty(sH.binlabels2),
         if length(edges)==1, 
             sH.binlabels2 = numtostring(som_denormalize(edges,sN),2);
             if length(bins)>1, 
               % two bins with one edge between them
               sH.binlabels2 = sH.binlabels2([1 1]);
               sH.binlabels2{1} = [']' sH.binlabels2{1} '['];
               sH.binlabels2{2} = ['[' sH.binlabels2{2} '['];
             end 
         else
             edges = edges(:); 
             bstr = numtostring(som_denormalize(edges,sN),4);
             % one label more than there are edges
             sH.binlabels2 = bstr([1:end end]);
             sH.binlabels2{1} = [bstr{1} '['];
             for k=2:length(sH.binlabels2)-1,
                 sH.binlabels2{k} = ['[' bstr{k-1} ',' bstr{k} '[']; 
             end 
             sH.binlabels2{end} = ['[' bstr{end}];
         end         
     end 
 
     return;
 
 function vstr = numtostring(v,d)
 
     %NUMTOSTRING Convert the numbers in v to strings, using precision d.
     %
     % Returns a cell array of strings made with NUM2STR(v,d), with all 
     % spaces removed. Nonzero values whose magnitude, relative to the 
     % range of v, is below 10^-d are written as '0.0' (positive values) 
     % or '-0.0' (negative values).
 
     % range of the values, used as the scale (1 if all values are equal)
     span = max(v)-min(v); 
     if span==0, span = 1; end
 
     % values which are negligible compared to the range
     tiny = (abs(v)/span < 10.^-d);
 
     vstr = strrep(cellstr(num2str(v,d)),' ','');
     vstr(find(tiny & v > 0)) = {'0.0'};
     vstr(find(tiny & v < 0)) = {'-0.0'};
     return;