source: trunk/SRC/Matrix/cmset_op.pro @ 325

Last change on this file since 325 was 325, checked in by pinsard, 16 years ago

modification of some headers (+some corrections) to prepare usage of the new idldoc

  • Property svn:keywords set to Id
File size: 16.0 KB
Line 
1;+
2;
3; @hidden
4;
5; @file_comments
6; Simplified version of CMSET_OP_UNIQ which sorts, and takes the
7; "first" value, whatever that may mean.
8;
9; @todo seb
10;
11;-
12FUNCTION cmset_op_uniq, a
13;
14  compile_opt idl2, strictarrsubs
15;
16  if n_elements(a) LE 1 then return, 0L
17
18  ii = sort(a) & b = a[ii]
19  wh = where(b NE shift(b, +1L), ct)
20  if ct GT 0 then return, ii[wh]
21
22  return, 0L
23;
24end
25;+
26;
27; @file_comments
28; Performs an AND, OR, or XOR operation between two sets
29;
30; Description: SET_OP performs three common operations between two sets. The
31; three supported functions of OP are:
32;
33;        OP          Meaning
34;      'AND' - to find the intersection of A and B;
35;      'OR'  - to find the union of A and B;
36;      'XOR' - to find the those elements who are members of A or B
37;              but not both;
38;
39;   Sets as defined here is one dimensional array composed of
40;   numeric or string types. Comparisons of equality between elements
41;   are done using the IDL EQ operator.
42;
43;   The complements of either set can be taken as well, by using the
44;   NOT1 and NOT2 keywords. For example, it may be desirable to find
45;   the elements in A but not B, or B but not A (they are different!).
46;   The following IDL expressions achieve each of those effects:
47;
48;      SET = CMSET_OP(A, 'AND', /NOT2, B)   ; A but not B
49;      SET = CMSET_OP(/NOT1, A, 'AND', B)   ; B but not A
50;
51;   Note the distinction between NOT1 and NOT2.  NOT1 refers to the
52;   first set (A) and NOT2 refers to the second (B).  Their ordered
53;   placement in the calling sequence is entirely optional, but the
54;   above ordering makes the logical meaning explicit.
55;
56;   NOT1 and NOT2 can only be set for the 'AND' operator, and never
57;   simultaneously. This is because the results of an operation with
58;   'OR' or 'XOR' and any combination of NOTs -- or with 'AND' and
59;   both NOTs -- formally cannot produce a defined result.
60;
61;   The implementation depends on the type of operands. For integer
62;   types, a fast technique using HISTOGRAM is used. However, this
63;   algorithm becomes inefficient when the dynamic range in the data
64;   is large. For those cases, and for other data types, a technique
65;   based on SORT() is used. Thus the compute time should scale
66;   roughly as (A+B)*ALOG(A+B) or better, rather than (A*B) for the
67;   brute force approach. For large arrays this is a significant
68;   benefit.
69;
70; @categories
71; Array
72;
73; @param A {in}{required}
74; The two sets to be operated on. A one dimensional array of
75; either numeric or string type. A and B must be of the same
76; type. Empty sets are permitted, and are either represented
77; as an undefined variable, or by setting EMPTY1 or EMPTY2.
78;
79; @param B {in}{required}
80; See A
81;
82; @param OP0 {in}{required}{type=string}
83; a string, the operation to be performed. Must be one of
84; 'AND', 'OR' or 'XOR' (lower or mixed case is permitted).
85; Other operations will cause an error message to be produced.
86;
87; @keyword NOT1
88; If set and OP is 'AND', then the complement of A (for
89; NOT1) or B (for NOT2) will be used in the operation.
90; NOT1 and NOT2 cannot be set simultaneously.
91;
92; @keyword NOT2
93; See NOT1
94;
95; @keyword EMPTY1
96; If set, then A (for EMPTY1) or B (for EMPTY2) are
97; assumed to be the empty set. The actual values
98; passed as A or B are then ignored.
99;
100; @keyword EMPTY2
101; See EMPTY1
102;
103; @keyword INDEX
104; if set, then return a list of indexes instead of the array
105; values themselves. The "slower" set operations are always
106; performed in this case.
107;
108; The indexes refer to the *combined* array [A,B]. To
109; clarify, in the following call: I = CMSET_OP(..., /INDEX);
110; returned values from 0 to NA-1 refer to A[I], and values
111; from NA to NA+NB-1 refer to B[I-NA].
112;
113; @keyword COUNT
114; upon return, the number of elements in the result set.
115; This is only important when the result set is the empty
116; set, in which case COUNT is set to zero.
117;
118; @returns
119; The resulting set as a one-dimensional array. The set may be
120; represented by either an array of data values (default), or an
121; array of indexes (if INDEX is set). Duplicate elements, if any,
122; are removed, and element order may not be preserved.
123;
124; The empty set is represented as a return value of -1L, and COUNT
125; is set to zero. Note that the only way to recognize the empty set
126; is to examine COUNT.
127;
128; SEE ALSO:
129;
130;  SET_UTILS.PRO by RSI
131;
132; @history
133; Written, CM, 23 Feb 2000
134;   Added empty set capability, CM, 25 Feb 2000
135;   Documentation clarification, CM 02 Mar 2000
136;   Incompatible but more consistent reworking of EMPTY keywords, CM,
137;     04 Mar 2000
138;   Minor documentation clarifications, CM, 26 Mar 2000
139;   Corrected bug in empty_arg special case, CM 06 Apr 2000
140;   Add INDEX keyword, CM 31 Jul 2000
141;   Clarify INDEX keyword documentation, CM 06 Sep 2000
142;   Made INDEX keyword always force SLOW_SET_OP, CM 06 Sep 2000
143;   Added CMSET_OP_UNIQ, and ability to select FIRST_UNIQUE or
144;     LAST_UNIQUE values, CM, 18 Sep 2000
145;   Removed FIRST_UNIQUE and LAST_UNIQUE, and streamlined
146;     CMSET_OP_UNIQ until problems with SORT can be understood, CM, 20
147;     Sep 2000 (thanks to Ben Tupper)
148;   Still trying to get documentation of INDEX and NOT right, CM, 28
149;     Sep 2000 (no code changes)
150;   Correct bug for AND case, when input sets A and B each only have
151;     one unique value, and the values are equal.  CM, 04 Mar 2004
152;     (thanks to James B. jbattat at cfa dot harvard dot edu)
153;   Add support for the cases where the input data types are mixed,
154;      but still compatible; also, attempt to return the same data
155;      type that was passed in; CM, 05 Feb 2005
156;   Fix bug in type checking (thanks to "marit"), CM, 10 Dec 2005
157;   Work around a stupidity in the built-in IDL HISTOGRAM routine,
158;      which tries to "help" you by restricting the MIN/MAX to the
159;      range of the input variable (thanks to Will Maddox), CM, 16 Jan 2006
160;
161;   Author: Craig B. Markwardt, NASA/GSFC Code 662, Greenbelt, MD 20770
162;   craigm\@lheamail.gsfc.nasa.gov
163;
164; @version
165; $Id$
166;
167; @examples
168; Utility function, similar to UNIQ, but allowing choice of taking
169; first or last unique element, or non-unique elements.
170; Unfortunately this doesn't work because of implementation dependent
171; versions of the SORT() function.
172;
173; function cmset_op_uniq, a, first=first, non=non, count=ct, sort=sortit
174;   if n_elements(a) LE 1 then return, 0L
175;   sh = (2L*keyword_set(first)-1L)*(-2L*keyword_set(non)+1)
176;
177;   if keyword_set(sortit) then begin
178;       ;; Sort it manually
179;       ii = sort(a) & b = a[ii]
180;       if keyword_set(non) then wh = where(b EQ shift(b, sh), ct) $
181;       else                     wh = where(b NE shift(b, sh), ct)
182;       if ct GT 0 then return, ii[wh]
183;   endif else begin
184;       ;; Use the user's values directly
185;       if keyword_set(non) then wh = where(a EQ shift(a, sh), ct) $
186;       else                     wh = where(a NE shift(a, sh), ct)
187;       if ct GT 0 then return, wh
188;   endelse
189;
190;   if keyword_set(first) then return, 0L else return, n_elements(a)-1
191; end
192
193; Simplified version of CMSET_OP_UNIQ which sorts, and takes the
194; "first" value, whatever that may mean.
195;
196;-
197FUNCTION cmset_op, a, op0, b, NOT1=not1, NOT2=not2, COUNT=count, $
198              EMPTY1=empty1, EMPTY2=empty2, MAXARRAY=ma, INDEX=index
199;
200  compile_opt idl2, strictarrsubs
201;
202
203  on_error, 2 ;; return on error
204  count = 0L
205  index0 = -1L
206  ;; Histogram technique is used for array sizes < 32,000 elements
207  if n_elements(ma) EQ 0 then ma = 32L*1024L
208
209  ;; Check the number of arguments
210  if n_params() LT 3 then begin
211      ARG_ERR:
212      message, 'USAGE: SET = CMSET_OP(A, OP, B [, COUNT=ct])', /info
213      message, '  KEYWORDS: /NOT1, /NOT2, /EMPTY1, /EMPTY2, INDEX', /info
214      return, -1L
215  endif
216  if n_elements(op0) EQ 0 then goto, ARG_ERR
217  kind = keyword_set(index)
218  fst = 1L
219  if keyword_set(last) then fst = 0L
220  if keyword_set(first) then fst = 1L
221
222  ;; Check the operation
223  sz = size(op0)
224  if sz[sz[0]+1] NE 7 then begin
225      OP_ERR:
226      message, "ERROR: OP must be 'AND', 'OR' or 'XOR'"
227      return, -1L
228  endif
229  op = strupcase(op0)
230  if op NE 'AND' AND op NE 'OR' AND op NE 'XOR' then goto, OP_ERR
231
232  ;; Check NOT1 and NOT2
233  if keyword_set(not1) AND keyword_set(not2) then begin
234      message, "ERROR: NOT1 and NOT2 cannot be set simultaneously"
235      return, -1L
236  endif
237  if (keyword_set(not1) OR keyword_set(not2)) AND $
238    (op EQ 'OR' OR op EQ 'XOR') then begin
239      message, "ERROR: NOT1 and NOT2 cannot be set with 'OR' or 'XOR'"
240      return, -1L
241  endif
242
243  ;; Special cases for empty set
244  n1 = n_elements(a) & n2 = n_elements(b)
245  if keyword_set(empty1) then n1 = 0L
246  if keyword_set(empty2) then n2 = 0L
247  if n1 EQ 0 OR n2 EQ 0 then begin
248      ;; Eliminate duplicates
249      if n1 GT 0 then a1 = cmset_op_uniq(a)
250      if n2 GT 0 then b1 = cmset_op_uniq(b)
251      n1 = n_elements(a1) < n1 & n2 = n_elements(b1) < n2
252      case op of
253          'OR': if n1 EQ 0 then goto, RET_A1 else goto, RET_B1
254         'XOR': if n1 EQ 0 then goto, RET_B1 else goto, RET_A1
255         'AND': begin
256             if keyword_set(not1) AND n1 EQ 0 then goto, RET_B1
257             if keyword_set(not2) AND n2 EQ 0 then goto, RET_A1
258             return, -1L
259         end
260     endcase
261     return, -1L
262     RET_A1:
263     count = n1
264     if kind then begin
265         if count GT 0 then return, a1 else return, -1L
266     endif
267     if count GT 0 then return, a[a1] else return, -1L
268     RET_B1:
269     count = n2
270     if kind then begin
271         if count GT 0 then return, b1+n1 else return, -1L
272     endif
273     if count GT 0 then return, b[b1] else return, -1L
274 endif
275
276  ;; Allow data to have different types, but they must be at least of
277  ;; the same "base" type.  That is, you can't combine a number with a
278  ;; string, etc.
279  ;; basetype 0:undefined 1:real number 6:complex number 7:string
280  ;;     8:structure 10:pointer 11:object
281
282  ;;          0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
283  basetype = [0, 1, 1, 1, 1, 1, 6, 7, 8, 6,10,11, 1, 1, 1, 1]
284
285  ;; Check types of operands
286  sz1 = size(a) & tp1 = sz1[sz1[0]+1]
287  sz2 = size(b) & tp2 = sz2[sz2[0]+1]
288  if tp1 LT 0 OR tp1 GE 16 OR tp2 LT 0 OR tp2 GE 16 then begin
289      message, 'ERROR: unrecognized data types for operands'
290      return, -1
291  endif
292  if basetype[tp1] NE basetype[tp2] then begin
293      TYPE1_ERR:
294      message, 'ERROR: both A and B must be of the same type'
295      return, -1L
296  endif
297  if tp1 EQ 8 OR tp1 EQ 10 OR tp1 EQ 11 then begin
298      TYPE2_ERR:
299      message, 'ERROR: operands must be a numeric or string type'
300      return, -1L
301  endif
302
303  ;; Now use two different kinds of algorithms: a slower but more
304  ;; general algorithm for generic types, and the histogram technique
305  ;; for integer types.  Even for integer types, if there is too much
306  ;; dynamic range, then the slow method is used.
307
308  if tp1 GE 4 AND tp1 LE 9 then begin
309      ;; String and real types, or large int arrays
310      SLOW_SET_OP:
311      case op of
312          'OR': begin
313              uu = [a,b]    ;; OR is simple; just take unique values
314              index0 = cmset_op_uniq(uu)
315              count = n_elements(index0)
316              if kind then return, index0
317              return, uu[index0]
318          end
319
320          'XOR': begin
321              ;; Make ordered list of set union
322              ai = cmset_op_uniq(a) & na = n_elements(ai)
323              bi = cmset_op_uniq(b) & nb = n_elements(bi)
324              ui = [ai, bi+n1]
325              uu = [a,b]    & uu = uu[ui] ;; Raw union...
326              us = sort(uu) & uu = uu[us] ;; ...and sort
327              if kind then ui = ui[temporary(us)] else ui = 0
328
329              ;; Values in one set only will not have duplicates
330              wh1 = where(uu NE shift(uu, -1), count1)
331              if count1 EQ 0 then return, -1L
332              wh = where(wh1[1:*]-wh1 EQ 1, count)
333              if wh1[0] EQ 0 then begin
334                  if count GT 0 then wh = [-1L, wh] else wh = [-1L]
335                  count = n_elements(wh)
336              endif
337              if count EQ 0 then return, -1
338              if kind then return, ui[wh1[wh+1]]
339              return, uu[wh1[wh+1]]
340          end
341
342          'AND': begin
343              ;; Make ordered list of set union
344              ai = cmset_op_uniq(a) & na = n_elements(ai)
345              bi = cmset_op_uniq(b) & nb = n_elements(bi)
346              ui = [ai, bi+n1]
347              uu = [a,b]    & uu = uu[ui]  ;; Raw union...
348              us = sort(uu) & uu = uu[us]  ;; ...and sort
349              if kind then ui = ui[us] else ui = 0
350
351              if NOT keyword_set(not1) AND NOT keyword_set(not2) then begin
352
353                  ;; Special case: if there is one in each set, and
354                  ;; they are equal, then the SHIFT() technique below
355                  ;; fails.  Do this one by hand.
356                  if na EQ 1 AND nb EQ 1 AND uu[0] EQ uu[1] then begin
357                      count = 1L
358                      if kind then return, 0L
359                      return, [uu[0]]
360                  endif
361
362                  ;; If neither "NOT" is set, then find duplicates
363                  us = 0L  ;; Save memory
364                  wh = where(uu EQ shift(uu, -1L), count) ;; Find non-unique
365                  if count EQ 0 then return, -1L
366                  ;; This should always select the element from A
367                  ;; rather than B (the smaller of the two)
368                  if kind then return, (ui[wh] < ui[wh+1])
369                  return, uu[wh]
370              endif
371
372              ;; For "NOT" cases, we need to identify by set
373              ii = make_array(na+nb, value=1b)
374              if keyword_set(not1) then ii[0:na-1] = 0
375              if keyword_set(not2) then ii[na:*]   = 0
376              ii = ii[temporary(us)]
377
378              ;; Remove any duplicates
379              wh1 = where(uu EQ shift(uu, -1L), count1) ;; Find non-unique
380              if count1 GT 0 then ii[wh1, wh1+1] = 0
381              ;; Remainder is the desired set
382              wh = where(ii, count)
383              if count EQ 0 then return, -1L
384              if kind then return, ui[wh]
385              return, uu[wh]
386          end
387
388      endcase
389      return, -1L  ;; DEFAULT CASE
390  endif else begin
391
392      ;; INDEX keyword forces the "slow" operation
393      if kind then goto, SLOW_SET_OP
394
395      ;; Integer types - use histogram technique if the data range
396      ;; is small enough, otherwise use the "slow" technique above
397      min1 = min(a, max=max1) & min2 = min(b, max=max2)
398      minn = min1 < min2 & maxx = max1 > max2
399      nbins = maxx-minn+1
400      if (maxx-minn) GT floor(ma[0]) then goto, SLOW_SET_OP
401
402      ;; Work around a stupidity in the built-in IDL HISTOGRAM routine
403      if (tp1 EQ 2 OR tp2 EQ 2) AND (minn LT -32768 OR maxx GT 32767) then $
404        goto, SLOW_SET_OP
405
406      ;; Following operations create a histogram of the integer values.
407      ha = histogram(a, min=minn, max=maxx) < 1
408      hb = histogram(b, min=minn, max=maxx) < 1
409
410      ;; Compute NOT cases
411      if keyword_set(not1) then ha = 1b - ha
412      if keyword_set(not2) then hb = 1b - hb
413      case op of
414          ;; Boolean operations
415          'AND': mask = temporary(ha) AND temporary(hb)
416           'OR': mask = temporary(ha)  OR temporary(hb)
417          'XOR': mask = temporary(ha) XOR temporary(hb)
418      endcase
419
420      wh = where(temporary(mask), count)
421      if count EQ 0 then return, -1L
422
423      result = temporary(wh+minn)
424      if tp1 NE tp2 then return, result
425      szr = size(result) & tpr = szr[szr[0]+1]
426
427      ;; Cast to the original type if necessary
428      if tpr NE tp1 then begin
429          fresult = make_array(n_elements(result), type=tp1)
430          fresult[0] = temporary(result)
431          result = temporary(fresult)
432      endif
433
434      return, result
435
436  endelse
437
438  return, -1L  ;; DEFAULT CASE
439end
440
441;     Here is how I did the INDEX stuff with fast histogramming.  It
442;     works, but is complicated, so I forced it to go to SLOW_SET_OP.
443;     ha = histogram(a, min=minn, max=maxx, reverse=ra) < 1
444;     rr = ra[0:nbins] & mask = rr NE rr[1:*] & ra = ra[rr]*mask-1L+mask
445;     hb = histogram(b, min=minn, max=maxx, reverse=rb) < 1
446;     rr = rb[0:nbins] & mask = rr NE rr[1:*] & rb = rb[rr]*mask-1L+mask
447;     ...  AND/OR/XOR NOT masking here ...
448;     ra = ra[wh] & rb = rb[wh]
449;     return, ra*(ra GE 0) + (rb+n1)*(ra LT 0) ;; is last 'ra' right?
450
Note: See TracBrowser for help on using the repository browser.