Browse code

starting som prediction fine-tuned class-performance visualisation

git-svn-id: https://svn.discofish.de/MATLAB/spmtoolbox/SVMCrossVal@112 83ab2cfd-5345-466c-8aeb-2b2739fb922d

Christoph Budziszewski authored on 21/01/2009 16:34:25
Showing 1 changed file
1 1
new file mode 100644
... ...
@@ -0,0 +1,257 @@
1
function csS = som_stats(D,varargin)

%SOM_STATS Calculate descriptive statistics for the data.
%  
% csS = som_stats(D,[sort]); 
% 
%  csS = som_stats(D); 
%  csS = som_stats(D,'nosort'); 
%  som_table_print(som_stats_table(csS))
%
%  Input and output arguments ([]'s are optional): 
%   D           (matrix) a matrix, size dlen x dim
%               (struct) data or map struct
%   [sort]      (string) 'sort' (default) or 'nosort'
%                        If 'nosort' is specified, the data is not 
%                        sorted, and therefore the nunique, values,
%                        counts, mfvalue, mfcount and tiles fields 
%                        are not calculated (they keep their NaN / []
%                        defaults). This may be useful if there is a 
%                        very large amount of data, and one wants to 
%                        reduce calculation time.
%
%   csS         (cell array) size 1 x dim, of statistics structs with 
%                        the following fields
%      .type             (string) 'som_stat'
%      .name             (string) name of the variable
%      .normalization    (struct array) variable normalization (see SOM_NORMALIZE)
%      .ntotal           (scalar) total number of values
%      .nvalid           (scalar) number of valid values (not Inf or NaN)
%      .min              (scalar) minimum of the valid values
%      .max              (scalar) maximum of the valid values
%      .mean             (scalar) mean of the valid values
%      .std              (scalar) standard deviation of the valid values
%      .nunique          (scalar) number of unique values
%      .mfvalue          (scalar) most frequent value
%      .mfcount          (scalar) number of occurrences of most frequent value
%      .values           (vector) all unique values if at most half of the
%                                 valid values are unique, otherwise at most 
%                                 MAXDISCRETE (see below) evenly picked 
%                                 unique values
%      .counts           (vector) number of occurrences for each value in .values
%      .tiles            (vector) NT-tile values, for example
%                                    NT=4   for quartiles: 25%, 50% and 75%
%                                    NT=100 for percentiles: 1%, 2%, ... and 99%
%      .hist             (struct) histogram struct with the following fields
%           .type        (string) 'som_hist'
%           .bins        (vector) histogram bin centers 
%           .counts      (vector) count of values in each bin
%           .binlabels   (cellstr) labels for the bins (denormalized bin
%                                  center values)
%           .binlabels2  (cellstr) labels for the bins (denormalized bin
%                                  edge values, e.g. '[1.4,2.5['
%
%   Constants: 
%      MAXDISCRETE = 20
%      NT          = 10
%      NBINS       = 10   (histogram bins when unique values are not used)
%
% See also  SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT.

% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 311201

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% arguments

% default values
nosort      = 0; 
nbins       = 10; 
maxdiscrete = 20; 
ntiles      = 10; 

% first argument: unpack a map/data struct, or invent names for a matrix
if isstruct(D), 
    switch D.type, 
    case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook; 
    case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data; 
    otherwise, error('Invalid first argument')
    end    
else
    cn = cell(size(D,2),1); 
    for i=1:length(cn), cn{i} = sprintf('Variable%d',i); end    
    sN = cell(size(D,2),1);   % no normalization information available
end
[dlen dim] = size(D);

% other arguments
if ~isempty(varargin), 
    if strcmp(varargin{1},'nosort'), nosort = 1; end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% template struct; fields that cannot be computed keep these defaults
sStat = struct('type','som_stat','name','','normalization',[],...
               'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
               'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
               'ntotal',dlen,'nvalid',NaN,'hist',[]);
csS = cell(0);
           
for i=1:dim, 
    sS = sStat;
    sS.name = cn{i};
    sS.normalization = sN{i}; 
    x = D(:,i); 
    x = x(isfinite(x));   % drop NaN and Inf values
    % basic descriptive statistics
    sS.nvalid = length(x);
    if length(x), 
        sS.min  = min(x);
        sS.max  = max(x);
        sS.mean = mean(x);  
        sS.std  = std(x);
        bins = [];
        if ~nosort, 
            xsorted    = sort(x);
            % number of unique values: j indexes the first element of
            % each run of equal values in the sorted data
            repeated   = (xsorted(1:end-1)==xsorted(2:end));
            j          = [1; find(~repeated)+1];         
            xunique    = xsorted(j); 
            sS.nunique = length(xunique);           
            ucount     = diff([j; length(xsorted)+1]);  % run lengths
            % most frequent value
            [fcount,j] = max(ucount);
            sS.mfvalue = xunique(j);
            sS.mfcount = fcount;
            % -tiles (k*100/ntiles % of values, k=1..)
            pickind    = round(linspace(1,sS.nvalid,ntiles+1)); 
            pickind    = pickind(2:end-1);
            sS.tiles   = xsorted(pickind);
            if sS.nunique <= sS.nvalid/2, 
                % few distinct values: keep all of them and use them as
                % the histogram bin centers
                sS.values = xunique; 
                sS.counts = ucount; 
                bins = sS.values; 
            else
                % at most maxdiscrete values, evenly picked. The number
                % of picks is capped at nunique so that no unique value
                % is listed more than once.
                npick      = min(maxdiscrete,sS.nunique);
                pickind    = round(linspace(1,sS.nunique,npick));
                sS.values  = xunique(pickind);
                sS.counts  = ucount(pickind);
 	    
                %% OPTION 2: maxdiscrete most frequent values
                %[v,j]     = sort(ucount); 
                %pickind   = j(1:maxdiscrete);             
                %sS.values = xunique(pickind);
                %sS.counts = ucount(pickind);

                % OPTION 3: representative values - calculated using k-means
                %[y,bm,qe] = kmeans(x,maxdiscrete);
                %sS.values = y; 
                %sS.counts = full(sum(sparse(bm,1:length(bm),1,maxdiscrete,length(bm)),2));
            end 
        end 
        if isempty(bins), 
            % nbins equally wide bins between min and max; use the centers
            bins = linspace(sS.min,sS.max,nbins+1); 
            bins = (bins(1:end-1)+bins(2:end))/2; 
        end
        sS.hist = som_hist(x,bins,sS.normalization);    
    else
        % no valid values: single-bin histogram with a zero count
        sS.hist = som_hist(x,0);
    end
    csS{end+1} = sS; 
end

return;
166
+
167
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
168
+%% subfunctions
169
+
170
function sH = som_hist(x,bins,sN)

%SOM_HIST Build a 'som_hist' histogram struct for a data vector.
%
%  x      (vector) data values
%  [bins] (vector) bin centers; if omitted, empty or all NaN, 10 equally 
%                  spaced centers between min(x) and max(x) are used
%         (struct) an existing histogram struct whose .bins, .binlabels
%                  and .binlabels2 fields are reused
%  [sN]   (any)    normalization information, passed as such to 
%                  SOM_DENORMALIZE when making the labels (default [])
%
%  sH     (struct) fields .type, .bins, .counts, .binlabels, .binlabels2

    binlabels  = []; 
    binlabels2 = []; 
    if nargin<3, sN = []; end

    usedefault = 0; 
    if nargin<2, 
        usedefault = 1; 
    elseif isstruct(bins), 
        % take bins and labels from the given histogram struct; this has
        % to be tested before ISNAN, which is not defined for structs
        sOld       = bins; 
        bins       = sOld.bins; 
        binlabels  = sOld.binlabels;
        binlabels2 = sOld.binlabels2;
    elseif isempty(bins), 
        usedefault = 1; 
    elseif all(isnan(bins(:))), 
        usedefault = 1; 
    end
    if usedefault, bins = linspace(min(x),max(x),10); end

    % The label fields are assigned after creation: giving a cell array
    % as a field value to STRUCT would create a struct array.
    sH = struct('type','som_hist','bins',bins,'counts',[],...
                'binlabels',[],'binlabels2',[]);                         
    sH.binlabels  = binlabels; 
    sH.binlabels2 = binlabels2; 
            
    if length(bins)==1,
        % single bin: everything falls into it
        sH.counts = [length(x)];
        edges = bins;
    else
        % bin edges are the midpoints between neighboring bin centers;
        % the outermost bins are open-ended
        edges = (bins(1:end-1)+bins(2:end))/2;
        if length(x),
            counts = histc(x,[-Inf; edges(:); Inf]);
            sH.counts = counts(1:end-1);   % last element only counts x==Inf
        else
            sH.counts = zeros(length(bins),1); 
        end
    end 

    if isempty(sH.binlabels),
        b = som_denormalize(bins(:),sN); 
        sH.binlabels = numtostring(b,4);
    end 

    if isempty(sH.binlabels2),
        if length(edges)==1, 
            % one bin (edges is the bin itself) or two bins (one edge)
            sH.binlabels2 = numtostring(som_denormalize(edges,sN),2);
            if length(bins)>1, 
              sH.binlabels2 = sH.binlabels2([1 1]);
              sH.binlabels2{1} = [']' sH.binlabels2{1} '['];
              sH.binlabels2{2} = ['[' sH.binlabels2{2} '['];
            end 
        else
            if size(edges,1)==1, edges = edges'; end
            bstr = numtostring(som_denormalize(edges,sN),4);
            % one label per bin, i.e. one more than there are edges
            sH.binlabels2 = bstr([1:end end]);
            sH.binlabels2{1} = [bstr{1} '['];
            for k=2:length(sH.binlabels2)-1,
                sH.binlabels2{k} = ['[' bstr{k-1} ',' bstr{k} '[']; 
            end 
            sH.binlabels2{end} = ['[' bstr{end}];
        end         
    end 

    return;
245
+
246
function vstr = numtostring(v,d)

%NUMTOSTRING Convert a numeric vector to a cell array of strings.
%
%  v     (vector) values to convert, row or column
%  d     (scalar) precision given to NUM2STR
%
%  vstr  (cellstr) size length(v) x 1, one string per value with all
%                  spaces removed. Nonzero values whose magnitude is 
%                  less than 10^-d times the range of v are written 
%                  as '0.0' or '-0.0' according to their sign. 

    if isempty(v), vstr = cell(0,1); return; end

    % NUM2STR makes one string per row, so a row vector would end up as
    % a single string with all the numbers run together
    v = v(:); 

    r = max(v)-min(v); 
    if r==0, r=1; end   % constant data: compare against absolute values
    nearzero = (abs(v)./r < 10.^-d);
    ipos = find(v > 0 & nearzero); 
    ineg = find(v < 0 & nearzero);     
    vstr = strrep(cellstr(num2str(v,d)),' ','');
    vstr(ipos) = {'0.0'};
    vstr(ineg) = {'-0.0'};
    return;
257
+