KMR
flexdice.c
1 /* flexdice.c (2014-02-04) -*-Coding: us-ascii;-*- */
2 /* Copyright (C) 1901-2014 Tomotake Nakamura */
3 
4 /* "FlexDice" - a clustering method which groups data objects in an
5  adjacent dense data space. Code is rewritten for KMR example. All
6  bugs are imputed to rewriters. */
7 
8 /* REFERENCES:
9  (1) "A Clustering Method using an Irregular Size Cell Graph",
10  RIDE-SDMA 2005, IEEE, 2005.
11  http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=1498227
12  (2) "FlexDice: A Fast Clustering Method for Large High Dimensional
13  Data Sets", IPSJ, 2005 (in Japanese).
14  http://ci.nii.ac.jp/naid/110002977727. */
15 
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <limits.h>
19 #include <float.h>
20 #include <math.h>
21 #include <string.h>
22 #include <sys/resource.h>
23 #include <errno.h>
24 #include <assert.h>
25 #include "flexdice.h"
26 
27 struct FLEXDICE fxd;
28 
29 static const int INT_BITS = (8 * sizeof(int));
30 
31 static void divide_cell_and_objects(struct CELL *oya);
32 static void set_direct_adjacency(struct CELL *cell);
33 static void undo_divide_cell(struct CELL *cell);
34 static void set_cell_range(struct CELL *cell, struct CELL *oya);
35 static void make_clusters(void);
36 static void finish_clusters(void);
37 
38 static void calculate_cell_path(unsigned int *, struct CELL *,
39  struct OBJECT *);
40 static void merge_clusters(struct CLUSTER *x0, struct CLUSTER *y0);
41 static void link_cells_to_cluster(struct CLUSTER *cluster, struct CELL *cell);
42 
43 static void setup_flexdice(struct CELL *top);
44 static void print_depth_limit(void);
45 static void output_statistics(void);
46 static double my_clock(void);
47 
48 #define BITSET(V,I) ((V)[(I)/INT_BITS] |= (1U << ((I)%INT_BITS)))
49 #define BITXOR(V,I) ((V)[(I)/INT_BITS] ^= (1U << ((I)%INT_BITS)))
50 #define BITGET(V,I) (((V)[(I)/INT_BITS] >> ((I)%INT_BITS)) & 1)
51 #define MIN(a,b) (((a)<(b))?(a):(b))
52 #define MAX(a,b) (((a)>(b))?(a):(b))
53 
54 static void *
55 malloc_safe(size_t s)
56 {
57  void *p = malloc(s);
58  if (p == NULL) {
59  fprintf(stderr, "Insufficient Memory.\n");
60  exit(1);
61  }
62  memset(p, 0, s);
63  return p;
64 }
65 
66 static bool
67 bits_equal(unsigned int *a, unsigned int *b)
68 {
69  for (int i = 0; i < fxd.CODE_SIZE; i++) {
70  if (a[i] != b[i])
71  return false;
72  }
73  return true;
74 }
75 
76 static void
77 bits_clear(unsigned int *a)
78 {
79  for (int i = 0; i < fxd.CODE_SIZE; i++) {
80  a[i] = 0;
81  }
82 }
83 
84 static void
85 bits_copy(unsigned int *a, unsigned int *b)
86 {
87  for (int i = 0; i < fxd.CODE_SIZE; i++) {
88  a[i] = b[i];
89  }
90 }
91 
92 static int
93 hash_bits(unsigned int *p, int m, struct INPARA *para)
94 {
95  return (p[0] % m);
96 }
97 
98 double
99 my_clock(void)
100 {
101  struct rusage ru;
102  int cc = getrusage(RUSAGE_SELF, &ru);
103  assert(cc == 0);
104  return (ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec * 1e-6);
105 }
106 
107 static int
108 log2i(int n)
109 {
110  int i = 0;
111  int v = n;
112  while (v > 1) {
113  v /= 2;
114  i++;
115  }
116  return i;
117 }
118 
119 /* Merges the second object-list to the first, and empties the
120  second. */
121 
122 static void
123 relink_object_lists(struct OBJECTQ *dst, struct OBJECTQ *src)
124 {
125  if (src->head != NULL) {
126  assert(src->tail != NULL && src->count != 0);
127  src->tail->next = dst->head;
128  dst->head = src->head;
129  if (dst->tail == NULL) {
130  assert(dst->count == 0);
131  dst->tail = src->tail;
132  }
133  dst->count += src->count;
134  src->count = 0;
135  //src->total_count = 0;
136  src->head = NULL;
137  src->tail = NULL;
138  }
139 }
140 
141 static void
142 setup_object(struct OBJECT *o, int id)
143 {
144  o->next = NULL;
145  o->value = malloc_safe(sizeof(DATATYPE) * fxd.DIM);
146  o->id = id;
147 }
148 
149 static void
150 setup_celllist(struct CELLS *cs)
151 {
152  cs->count = 0;
153  cs->head = NULL;
154 }
155 
156 static void
157 setup_object_queue(struct OBJECTQ *data)
158 {
159  data->count = 0;
160  //data->total_count = 0;
161  data->head = NULL;
162  data->tail = NULL;
163 }
164 
165 static void
166 put_cluster(struct CLUSTER *c)
167 {
168  c->v_next = fxd.clusters;
169  fxd.clusters = c;
170  fxd.cluster_count++;
171 }
172 
173 static void
174 setup_cluster(struct CLUSTER *cluster, int id)
175 {
176  //setup_object_queue(&cluster->objects);
177  cluster->v_next = NULL;
178  cluster->n_next = cluster;
179  cluster->id = id;
180  cluster->object_count = 0;
181  cluster->active = true;
182  put_cluster(cluster);
183 }
184 
185 static void
186 setup_layer(struct LAYER *layer)
187 {
188  layer->object_count = 0;
189  layer->cell_count = 0;
190  layer->child_cell_count = 0;
191 }
192 
193 /* Takes an object in a cell, or returns null. */
194 
195 static struct OBJECT *
196 take_object_from_cell(struct CELL *cell)
197 {
198  struct OBJECT *o = cell->objects.head;
199  if (o != NULL) {
200  assert(cell->objects.count > 0);
201  cell->objects.head = cell->objects.head->next;
202  if (cell->objects.head == NULL) {
203  assert(cell->objects.count == 1);
204  cell->objects.tail = NULL;
205  }
206  cell->objects.count--;
207  o->next = NULL;
208  }
209  return o;
210 }
211 
212 static void
213 update_layer_stat(struct LAYER *layer, struct CELL *c)
214 {
215  layer->cell_count++;
216  layer->child_cell_count += c->child_count;
217  layer->object_count += c->total_object_count;
218 }
219 
220 static void
221 setup_cell_queue(struct CELLQ *q)
222 {
223  q->head = NULL;
224  q->tail = NULL;
225  q->count = 0;
226 }
227 
228 /* Puts a cell to the FIFO queue. */
229 
230 static void
231 put_cell_in_queue(struct CELL *c)
232 {
233  assert(c->q_next == NULL);
234  if (fxd.queue.count == 0) {
235  assert(fxd.queue.head == NULL);
236  fxd.queue.head = c;
237  } else {
238  fxd.queue.tail->q_next = c;
239  }
240  fxd.queue.tail = c;
241  fxd.queue.count++;
242 }
243 
244 /* Takes a cell from the FIFO queue. */
245 
246 static struct CELL *
247 take_cell_from_queue(void)
248 {
249  assert(fxd.queue.count != 0);
250  struct CELL *c = fxd.queue.head;
251  fxd.queue.head = c->q_next;
252  fxd.queue.count--;
253  return c;
254 }
255 
256 static void
257 link_cell_list(struct CELLS *h, struct CELL *cell)
258 {
259  cell->d_next = h->head;
260  h->head = cell;
261  h->count++;
262 }
263 
264 static void
265 put_object_in_cell(struct OBJECTQ *data, struct OBJECT *o)
266 {
267  assert(o->next == NULL);
268  if (data->head == NULL) {
269  assert(data->tail == NULL && data->count == 0);
270  data->head = o;
271  } else {
272  assert(data->tail != NULL && data->count != 0);
273  data->tail->next = o;
274  }
275  data->tail = o;
276  data->count++;
277 }
278 
279 static void
280 put_cell_in_cluster(struct CLUSTER *cluster, struct CELL *p)
281 {
282  assert(p->cluster == NULL && p->x_next == NULL);
283  p->cluster = cluster;
284  p->x_next = cluster->cells;
285  cluster->cells = p;
286 }
287 
288 static struct CELLS *
289 make_hashtable()
290 {
291  int hashsize = fxd.para->hashsize;
292  struct CELLS *h = malloc_safe(sizeof(struct CELLS) * hashsize);
293  for (int i = 0; i < hashsize; i++) {
294  setup_celllist(&h[i]);
295  }
296  return h;
297 }
298 
299 /* Finds an adjacent cell along with the given relative PATH of the
300  attributes. It returns NULL if nothing found. */
301 
302 static struct CELL *
303 find_child(struct CELL *oya, unsigned int *path)
304 {
305  int hv = hash_bits(path, fxd.para->hashsize, fxd.para);
306  struct CELLS *hashbin = &(oya->hashed_children[hv]);
307  struct CELL *cell;
308  for (cell = hashbin->head; cell != NULL; cell = cell->d_next) {
309  assert(cell->density != DEBRIS);
310  if (bits_equal(path, cell->path)) {
311  break;
312  }
313  }
314  return cell;
315 }
316 
317 static void
318 put_child(struct CELL *oya, unsigned int *path, struct CELL *cell)
319 {
320  int hv = hash_bits(path, fxd.para->hashsize, fxd.para);
321  struct CELLS *hashbin = &(oya->hashed_children[hv]);
322  link_cell_list(hashbin, cell);
323  cell->c_next = oya->children;
324  oya->children = cell;
325  oya->child_count++;
326 }
327 
328 static void
329 setup_cell(struct CELL *cell, unsigned int *path)
330 {
331  cell->c_next = NULL;
332  cell->q_next = NULL;
333  cell->d_next = NULL;
334  cell->x_next = NULL;
335  cell->level = 0;
336  cell->density = MADA;
337  cell->child_count = 0;
338  cell->total_object_count = 0;
339  cell->parent = NULL;
340  cell->path = malloc_safe(sizeof(unsigned int) * fxd.CODE_SIZE);
341  if (path == NULL) {
342  bits_clear(cell->path);
343  } else {
344  bits_copy(cell->path, path);
345  }
346  cell->cluster = NULL;
347  cell->children = NULL;
348  cell->hashed_children = NULL;
349  cell->adjacents = malloc_safe(sizeof(struct CELL *) * 2 * fxd.DIM);
350  for (int dir = 0; dir < fxd.DIM_UPDN; dir++) {
351  cell->adjacents[dir] = NULL;
352  }
353  cell->min = malloc_safe(sizeof(DATATYPE) * fxd.DIM);
354  cell->max = malloc_safe(sizeof(DATATYPE) * fxd.DIM);
355  cell->cen = malloc_safe(sizeof(DATATYPE) * fxd.DIM);
356  for (int attr = 0; attr < fxd.DIM; attr++) {
357  cell->min[attr] = 0;
358  cell->max[attr] = 0;
359  cell->cen[attr] = 0;
360  }
361  setup_object_queue(&cell->objects);
362 }
363 
364 /* Makes a cell for PATH for the PARENT. PARENT and PATH can be
365  NULL. */
366 
367 struct CELL *
368 create_cell(struct CELL *oya, unsigned int *path)
369 {
370  struct CELL *cell = malloc_safe(sizeof(struct CELL));
371  setup_cell(cell, path);
372  fxd.n_cells++;
373  if (oya != NULL) {
374  cell->parent = oya;
375  cell->level = (oya->level + 1);
376  bits_copy(cell->path, path);
377  put_child(oya, path, cell);
378  }
379  return cell;
380 }
381 
382 static void
383 put_cell_to_dense_list(struct CELL *c)
384 {
385  assert(c->d_next == NULL);
386  c->d_next = fxd.dense_cells;
387  fxd.dense_cells = c;
388  fxd.n_dense_cells++;
389 }
390 
391 static void
392 put_cell_to_noise_list(struct CELL *c)
393 {
394  assert(c->d_next == NULL);
395  c->d_next = fxd.noise_cells;
396  fxd.noise_cells = c;
397  fxd.n_noise_cells++;
398  fxd.n_noise_objects += c->objects.count;
399 }
400 
401 /* Runs FlexDice. Call it after "init_flexdice" and "read_input". */
402 
403 int
404 flexdice(struct CELL *top, struct INPARA *para)
405 {
406 #if 0
407  init_flexdice(argv, para);
408  print_parameters(para);
409  struct CELL *top = create_cell(NULL, NULL);
410  read_input(top, para);
411 #endif
412 
413  top->density = MADA;
414  setup_flexdice(top);
415 
416  print_depth_limit();
417 
418  fxd.t[0] = my_clock();
419 
420  /* Move initial cells to the processing queue. */
421 
422  put_cell_in_queue(top);
423 
424  /* Phase 1: Start. */
425 
426  int level = 0;
427  while (fxd.queue.count != 0) {
428  int count = 0;
429  for (struct CELL *p = fxd.queue.head; p != NULL; p = p->q_next) {
430  count++;
431  }
432  assert(count == fxd.queue.count);
433  assert(fxd.layers[level].cell_count == 0);
434 
435  printf("layer-info: "
436  "level=%d #cells-at-level=%ld #dense-cells=%ld\n",
437  level, fxd.queue.count, fxd.n_dense_cells);
438 
439  for (struct CELL *p = fxd.queue.head; p != NULL; p = p->q_next) {
440  assert((p->level == level) && (p->density == MADA));
441  if (p->objects.count < fxd.para->dmin) {
442  p->density = SPARSE;
443  } else if (level == (fxd.para->nlayers - 1)) {
444  p->density = DENSE;
445  } else {
446  p->density = MIDDLE;
447  divide_cell_and_objects(p);
448  }
449  update_layer_stat(&fxd.layers[level], p);
450  }
451  assert(fxd.layers[level].cell_count == fxd.queue.count);
452 
453  /* Work after all siblings at the level are divided. */
454 
455  assert(fxd.layers[level].cell_count != 0);
456  double ratio = (1.0 * fxd.layers[level].child_cell_count
457  / fxd.layers[level].cell_count);
458  int threshold = (int)(fxd.para->dfac * ratio);
459 
460  int n = fxd.queue.count;
461  for (int i = 0; i < n; i++) {
462  struct CELL *p = take_cell_from_queue();
463  assert(p->level == level);
464  set_direct_adjacency(p);
465  if (p->child_count > threshold) {
466  p->density = DENSE;
467  undo_divide_cell(p);
468  }
469  assert(p->density != MADA);
470  if (p->density == SPARSE) {
471  put_cell_to_noise_list(p);
472  } else if (p->density == DENSE) {
473  put_cell_to_dense_list(p);
474  } else {
475  assert(p->density == MIDDLE);
476  fxd.n_middle_cells++;
477  for (struct CELL *c = p->children; c != NULL; c = c->c_next) {
478  put_cell_in_queue(c);
479  }
480  }
481  }
482 
483  level++;
484  }
485 
486  /* Phase 1: End. */
487 
488  /* Check the number of objects. */
489 
490  {
491  int no = 0;
492  int nc = 0;
493  for (struct CELL *c = fxd.noise_cells; c != NULL; c = c->d_next) {
494  assert(c->objects.count != 0 && c->child_count == 0);
495  no += c->objects.count;
496  nc++;
497  }
498  int oo = 0;
499  int dc = 0;
500  for (struct CELL *c = fxd.dense_cells; c != NULL; c = c->d_next) {
501  assert(c->density == DENSE);
502  assert(c->objects.count != 0);
503  oo += c->objects.count;
504  dc++;
505  }
506  assert(fxd.n_objects == (oo + no));
507  assert(fxd.n_noise_objects == no);
508  assert(fxd.n_noise_cells == nc);
509  assert(fxd.n_dense_cells == dc);
510  assert(fxd.n_cells == (fxd.n_middle_cells + fxd.n_debris_cells
511  + nc + dc));
512  }
513 
514  fxd.t[1] = my_clock();
515 
516  printf("result: " "#dense-cells=%ld #noise-objects=%ld\n",
517  fxd.n_dense_cells, fxd.n_noise_objects);
518 
519  if (fxd.dense_cells == NULL) {
520  fprintf(stderr, "All objects are noise.\n");
521  exit(1);
522  }
523 
524  /* Phase 2: Start. */
525 
526  /* Construct subclusters. */
527 
528  make_clusters();
529  finish_clusters();
530 
531  /* Phase 2: End. */
532 
533  /* Check the number of objects. */
534 
535  {
536  int oo = 0;
537  int dc = 0;
538  for (struct CLUSTER *p = fxd.clusters; p != NULL; p = p->v_next) {
539  if (!p->active) {
540  continue;
541  }
542  oo += p->object_count;
543  {
544  struct CLUSTER *q = p;
545  do {
546  for (struct CELL *c = q->cells; c != NULL; c = c->x_next) {
547  if (c->density == DENSE) {
548  dc++;
549  }
550  }
551  q = q->n_next;
552  } while (q != p);
553  }
554  }
555  assert(fxd.n_objects == (oo + fxd.n_noise_objects));
556  assert(fxd.n_dense_cells == dc);
557  }
558 
559  fxd.t[2] = my_clock();
560 
561  return 0;
562 }
563 
564 static void
565 delete_cell(struct CELL *c)
566 {
567  free(c->path);
568  free(c->min);
569  free(c->max);
570  free(c->cen);
571  free(c->adjacents);
572  //free(c->children);
573  if (c->hashed_children != NULL) {
574  free(c->hashed_children);
575  }
576 }
577 
578 /*NOTYET*/
579 
580 static void
581 delete_flexdice()
582 {
583  free(fxd.depth_limits);
584  free(fxd.layers);
585  //free(fxd.noise_cells);
586 }
587 
588 void
589 init_flexdice(int argc, char **argv, struct INPARA *para)
590 {
591  fxd.para = para;
592  fxd.DIM = fxd.para->dim;
593  fxd.DIM_UPDN = (2 * fxd.para->dim);
594  fxd.CODE_SIZE = ((fxd.para->dim - 1) / INT_BITS) + 1;
595 }
596 
597 void
598 fin_flexdice(void)
599 {
600  delete_flexdice();
601 }
602 
603 void
604 read_input(struct CELL *input, struct INPARA *para)
605 {
606  int cc;
607  DATATYPE max[fxd.DIM];
608  DATATYPE min[fxd.DIM];
609  input->level = 0;
610  input->parent = NULL;
611  for (int dir = 0; dir < fxd.DIM_UPDN; dir++) {
612  input->adjacents[dir] = NULL;
613  }
614  for (int attr = 0; attr < fxd.DIM; attr++) {
615 #if (DATA == DATA_Z)
616  max[attr] = INT_MIN;
617  min[attr] = INT_MAX;
618 #else
619  max[attr] = -DBL_MAX;
620  min[attr] = DBL_MAX;
621 #endif
622  }
623 
624  FILE *f = fopen(para->infile, "r");
625  assert(f != NULL);
626  int nobjects = 0;
627 
628  {
629  struct OBJECT *o = NULL;
630  int attr = 0;
631  DATATYPE val;
632  errno = 0;
633 #if (DATA == DATA_Z)
634  #define FMT "%d"
635 #else
636  #define FMT "%lf"
637 #endif
638  for (;;) {
639  cc = fscanf(f, FMT, &val);
640  if (cc == EOF) {
641  break;
642  }
643  if (cc == 0) {
644 #define BAD_DATA_IN_INPUT 0
645  assert(BAD_DATA_IN_INPUT);
646  }
647  if (attr == 0) {
648  o = malloc_safe(sizeof(struct OBJECT));
649  setup_object(o, nobjects);
650  nobjects++;
651  }
652  o->value[attr] = val;
653  max[attr] = MAX(max[attr], val);
654  min[attr] = MIN(min[attr], val);
655  if (attr == (fxd.DIM - 1)) {
656  put_object_in_cell(&(input->objects), o);
657  attr = 0;
658  } else {
659  attr++;
660  }
661  }
662  assert(errno == 0);
663  }
664  cc = fclose(f);
665  assert(cc == 0);
666 
667  assert(input->objects.count == nobjects);
668  for (int attr = 0; attr < fxd.DIM; attr++) {
669  input->max[attr] = max[attr];
670  input->min[attr] = min[attr];
671  input->cen[attr] = (min[attr] + (max[attr] - min[attr]) / 2);
672  }
673 }
674 
675 /* Sets input with integer data DATA[COUNT][DIM]. */
676 
677 void
678 set_input(struct CELL *input, struct INPARA *para, int *data, long count)
679 {
680  assert(DATA == DATA_Z);
681  int max[fxd.DIM];
682  int min[fxd.DIM];
683  input->level = 0;
684  input->parent = NULL;
685  for (int dir = 0; dir < fxd.DIM_UPDN; dir++) {
686  input->adjacents[dir] = NULL;
687  }
688  for (int attr = 0; attr < fxd.DIM; attr++) {
689  max[attr] = INT_MIN;
690  min[attr] = INT_MAX;
691  }
692  for (long i = 0; i < count; i++) {
693  struct OBJECT *o = malloc_safe(sizeof(struct OBJECT));
694  setup_object(o, i);
695  for (int attr = 0; attr < fxd.DIM; attr++) {
696  int val = data[(fxd.DIM * i) + attr];
697  o->value[attr] = val;
698  max[attr] = MAX(max[attr], val);
699  min[attr] = MIN(min[attr], val);
700  }
701  put_object_in_cell(&(input->objects), o);
702  }
703  for (int attr = 0; attr < fxd.DIM; attr++) {
704  assert(max[attr] != INT_MIN);
705  assert(min[attr] != INT_MAX);
706  }
707  for (int attr = 0; attr < fxd.DIM; attr++) {
708  input->max[attr] = max[attr];
709  input->min[attr] = min[attr];
710  input->cen[attr] = (min[attr] + (max[attr] - min[attr]) / 2);
711  }
712 }
713 
714 /* Makes files ("datamax" and "datamin") holding min. and max. of
715  the input data. */
716 
717 void
718 output_statistics(void)
719 {
720  int cc;
721  char name[100];
722 
723  cc = snprintf(name, sizeof(name), "%s/datamax", fxd.para->outdir);
724  assert(cc < (int)sizeof(name));
725  FILE *f0 = fopen(name, "w");
726  assert(f0 != 0);
727 
728  struct CELL *top = fxd.top;
729  for (int attr = 0; attr < fxd.DIM; attr++) {
730 #if (DATA == DATA_Z)
731  cc = fprintf(f0, "%s%d", (attr == 0 ? "" : " "), top->max[attr]);
732  assert(cc >= 0);
733 #else
734  cc = fprintf(f0, "%s%f", (attr == 0 ? "" : " "), top->max[attr]);
735  assert(cc >= 0);
736 #endif
737  }
738  cc = fprintf(f0, "\n");
739  assert(cc >= 0);
740  cc = fclose(f0);
741  assert(cc == 0);
742 
743  cc = snprintf(name, sizeof(name), "%s/datamin", fxd.para->outdir);
744  assert(cc < (int)sizeof(name));
745  FILE *f1 = fopen(name, "w");
746  assert(f1 != 0);
747  for (int attr = 0; attr < fxd.DIM; attr++) {
748 #if (DATA == DATA_Z)
749  cc = fprintf(f1, "%s%d", (attr == 0 ? "" : " "), top->min[attr]);
750  assert(cc >= 0);
751 #else
752  cc = fprintf(f1, "%s%f", (attr == 0 ? "" : " "), top->min[attr]);
753  assert(cc >= 0);
754 #endif
755  }
756  cc = fprintf(f1, "\n");
757  assert(cc >= 0);
758  cc = fclose(f1);
759  assert(cc == 0);
760 }
761 
762 static void
763 output_object(FILE *f, struct OBJECT *o)
764 {
765  int cc;
766  cc = fprintf(f, "%d ", o->id);
767  assert(cc >= 0);
768  for (int attr = 0; attr < fxd.DIM; attr++) {
769 #if (DATA == DATA_Z)
770  cc = fprintf(f, " %d", o->value[attr]);
771  assert(cc >= 0);
772 #else
773  cc = fprintf(f, " %f", o->value[attr]);
774  assert(cc >= 0);
775 #endif
776  }
777  cc = fprintf(f, "\n");
778  assert(cc >= 0);
779 }
780 
781 void
782 output_clusters(void)
783 {
784  int cc;
785  char name[50];
786  char command[2048];
787 
788  int digits = ((int)log10((double)fxd.n_clusters) + 1);
789 
790  cc = snprintf(command, sizeof(command), "mkdir -p %s", fxd.para->outdir);
791  assert(cc < (int)sizeof(command));
792  cc = system(command);
793  assert(cc != -1);
794 
795  output_statistics();
796 
797  int count = 0;
798  for (struct CLUSTER *p = fxd.clusters; p != NULL; p = p->v_next) {
799  if (!p->active) {
800  continue;
801  }
802  cc = snprintf(name, sizeof(name), "%s/C%0*d",
803  fxd.para->outdir, digits, count);
804  assert(cc < (int)sizeof(name));
805  FILE *f0 = fopen(name, "w");
806  assert(f0 != 0);
807  for (struct CELL *c = p->cells; c != NULL; c = c->x_next) {
808  for (struct OBJECT *o = c->objects.head; o != NULL; o = o->next) {
809  output_object(f0, o);
810  }
811  }
812  cc = fclose(f0);
813  assert(cc == 0);
814  count++;
815  }
816  assert(count == fxd.n_clusters);
817 
818  _Bool noise[fxd.para->nlayers];
819  for (int i = 0; i < fxd.para->nlayers; i++) {
820  noise[i] = 0;
821  }
822 
823  for (int i = 0; i < fxd.para->nlayers; i++) {
824  cc = snprintf(name, sizeof(name), "%s/NL%d", fxd.para->outdir, i);
825  assert(cc < (int)sizeof(name));
826  FILE *f1 = fopen(name, "w");
827  assert(f1 != 0);
828  for (struct CELL *c = fxd.noise_cells; c != NULL; c = c->d_next) {
829  if (c->level != i) {
830  continue;
831  }
832  noise[i] = 1;
833  for (struct OBJECT *o = c->objects.head; o != NULL; o = o->next) {
834  output_object(f1, o);
835  }
836  }
837  cc = fclose(f1);
838  assert(cc == 0);
839  }
840 
841 #if 0
842  if (fxd.para->dim == 2) {
843  cc = snprintf(name, sizeof(name), "%s/plot", fxd.para->outdir);
844  assert(cc < (int)sizeof(name));
845  FILE *f2 = fopen(name, "w");
846  assert(f2 != 0);
847  cc = fprintf(f2, "plot 'NL'notitle");
848  assert(cc >= 0);
849  int cnt2 = 0;
850  for (struct CLUSTER *p = fxd.clusters; p != NULL; p = p->v_next) {
851  if (p->active && (fxd.para->plot_cut_off < p->object_count)) {
852  cc = fprintf(f2, ",'C%0*d'notitle", digits, cnt2);
853  assert(cc >= 0);
854  }
855  cnt2++;
856  }
857  cc = fclose(f2);
858  assert(cc == 0);
859  }
860 #endif
861 
862  if (fxd.para->dim >= 2) {
863  cc = snprintf(name, sizeof(name), "%s/plot2d", fxd.para->outdir);
864  assert(cc < (int)sizeof(name));
865  FILE *f3 = fopen(name, "w");
866  assert(f3 != 0);
867  cc = fprintf(f3, "plot ");
868  assert(cc >= 0);
869  char *fsep = "";
870  int cnt3 = 0;
871  for (struct CLUSTER *p = fxd.clusters; p != NULL; p = p->v_next) {
872  if (p->active && (p->object_count > fxd.para->plot_cut_off)) {
873  cc = fprintf(f3, "%s'C%0*d' u 2:3 w d", fsep, digits, cnt3);
874  assert(cc >= 0);
875  fsep = ", ";
876  }
877  cnt3++;
878  }
879  assert(fsep[0] != 0);
880  if (fxd.para->plot_noise) {
881  for (int i = 1; i < fxd.para->nlayers; i++) {
882  if (noise[i]) {
883  cc = fprintf(f3, "%s'NL%d' u 2:3 w d", fsep, i);
884  assert(cc >= 0);
885  }
886  }
887  }
888  cc = fclose(f3);
889  assert(cc == 0);
890  }
891  if (fxd.para->dim >= 3) {
892  cc = snprintf(name, sizeof(name), "%s/plot3d", fxd.para->outdir);
893  assert(cc < (int)sizeof(name));
894  FILE *f4 = fopen(name, "w");
895  assert(f4 != 0);
896  cc = fprintf(f4, "splot ");
897  assert(cc >= 0);
898  char *fsep = "";
899  int cnt4 = 0;
900  for (struct CLUSTER *p = fxd.clusters; p != NULL; p = p->v_next) {
901  if (p->active && (p->object_count > fxd.para->plot_cut_off)) {
902  cc = fprintf(f4, "%s'C%0*d' u 2:3:4 w d", fsep, digits, cnt4);
903  assert(cc >= 0);
904  fsep = ", ";
905  }
906  cnt4++;
907  }
908  assert(fsep[0] != 0);
909  if (fxd.para->plot_noise) {
910  for (int i = 1; i < fxd.para->nlayers; i++) {
911  if (noise[i]) {
912  cc = fprintf(f4, "%s'NL%d' u 2:3:4 w d", fsep, i);
913  assert(cc >= 0);
914  }
915  }
916  }
917  cc = fclose(f4);
918  assert(cc == 0);
919  }
920 
921  /* Make "inputdata.data" for calculating precision (Ec). */
922 
923  {
924  cc = snprintf(name, sizeof(name), "%s/inputdata.dat",
925  fxd.para->outdir);
926  assert(cc < (int)sizeof(name));
927  FILE *f5 = fopen(name, "w");
928  assert(f5 != 0);
929  int cnt5 = 0;
930  for (struct CLUSTER *p = fxd.clusters; p != NULL; p = p->v_next) {
931  if (p->active && fxd.para->plot_cut_off < p->object_count) {
932  cc = fprintf(f5, "C%0*d\n", digits, cnt5);
933  assert(cc >= 0);
934  }
935  cnt5++;
936  }
937  cc = fprintf(f5, "NL\n");
938  assert(cc >= 0);
939  cc = fclose(f5);
940  assert(cc == 0);
941  }
942 }
943 
944 /* Sets the depth-limit of each attribute. It errs if the parameter
945  value is too large. */
946 
947 static void
948 setup_flexdice(struct CELL *top)
949 {
950  fxd.depth_limits = malloc_safe(sizeof(int) * fxd.para->dim);
951  int botmax = INT_MIN;
952  for (int attr = 0; attr < fxd.DIM; attr++) {
953 #if (DATA == DATA_Z)
954  DATATYPE w = (top->max[attr] - top->min[attr]);
955  fxd.depth_limits[attr] = log2i(w + 1);
956 #else
957  fxd.depth_limits[attr] = (fxd.para->nlayers - 1);
958 #endif
959  assert(fxd.depth_limits[attr] > 0);
960  botmax = MAX(botmax, fxd.depth_limits[attr]);
961  }
962  if (botmax < (fxd.para->nlayers - 1)) {
963  fprintf(stderr, "input parameter error; layers be %d.\n", botmax);
964  assert(0);
965  exit(1);
966  }
967 
968  fxd.n_cells = 1;
969  fxd.n_objects = top->objects.count;
970  fxd.n_middle_cells = 0;
971  fxd.n_dense_cells = 0;
972  fxd.n_clusters = 0;
973  fxd.n_noise_cells = 0;
974  fxd.n_debris_cells = 0;
975  fxd.n_noise_objects = 0;
976 
977  fxd.layers = malloc_safe(sizeof(struct LAYER) * fxd.para->nlayers);
978  for (int i = 0; i < fxd.para->nlayers; i++) {
979  setup_layer(&fxd.layers[i]);
980  }
981  fxd.layers[0].object_count = top->objects.count;
982  fxd.layers[0].cell_count = 0;
983  fxd.layers[0].child_cell_count = 0;
984 
985  fxd.clusters = NULL;
986  fxd.cluster_count = 0;
987  fxd.t[0] = 0.0;
988  fxd.t[1] = 0.0;
989  fxd.t[2] = 0.0;
990 
991  setup_cell_queue(&(fxd.queue));
992 
993  fxd.noise_cells = NULL;
994  fxd.dense_cells = NULL;
995 
996  fxd.top = top;
997 }
998 
999 /* *** PHASE 1 *** */
1000 
1001 /* Distributes data objects to child cells, making child cells if
1002  needed. */
1003 
1004 static void
1005 divide_cell_and_objects(struct CELL *cell)
1006 {
1007  unsigned int path[fxd.CODE_SIZE];
1008  assert(cell->child_count == 0 && cell->hashed_children == NULL);
1009  cell->total_object_count = cell->objects.count;
1010  cell->hashed_children = make_hashtable();
1011  int nchildren = 0;
1012  struct OBJECT *o;
1013  while ((o = take_object_from_cell(cell)) != NULL) {
1014  calculate_cell_path(path, cell, o);
1015  struct CELL *c = find_child(cell, path);
1016  if (c == NULL) {
1017  c = create_cell(cell, path);
1018  set_cell_range(c, cell);
1019  nchildren++;
1020  }
1021  put_object_in_cell(&(c->objects), o);
1022  }
1023  assert(cell->objects.count == 0);
1024  assert(cell->child_count == nchildren);
1025 }
1026 
1027 /* Sets the value range of a cell. */
1028 
1029 static void
1030 set_cell_range(struct CELL *cell, struct CELL *oya)
1031 {
1032  for (int attr = 0; attr < fxd.DIM; attr++) {
1033  if (oya->level < fxd.depth_limits[attr]) {
1034  /* Divisible dimension. */
1035  int updn = BITGET(cell->path, attr);
1036  if (updn == 0) {
1037  cell->min[attr] = oya->min[attr];
1038  cell->max[attr] = oya->cen[attr];
1039  } else {
1040  cell->min[attr] = oya->cen[attr];
1041  cell->max[attr] = oya->max[attr];
1042  }
1043  cell->cen[attr] = (cell->min[attr]
1044  + ((cell->max[attr] - cell->min[attr]) / 2));
1045  } else {
1046  /* Not-divisible dimension. */
1047  cell->min[attr] = oya->min[attr];
1048  cell->max[attr] = oya->max[attr];
1049  cell->cen[attr] = oya->cen[attr];
1050  }
1051  }
1052 }
1053 
1054 static void
1055 undo_divide_cell(struct CELL *cell)
1056 {
1057  assert(cell->objects.count == 0);
1058  for (struct CELL *c = cell->children; c != NULL; c = c->c_next) {
1059  assert(c->density == MADA);
1060  relink_object_lists(&(cell->objects), &(c->objects));
1061  c->density = DEBRIS;
1062  fxd.n_debris_cells++;
1063  }
1064 }
1065 
1066 /* Calculates a relative path to the cell for an object. It is a bit
1067  vector on attributes, indicating up(1) or down(0) from the
1068  parent. */
1069 
1070 static void
1071 calculate_cell_path(unsigned int *path, struct CELL *oya, struct OBJECT *obj)
1072 {
1073  bits_clear(path);
1074  for (int attr = 0; attr < fxd.DIM; attr++) {
1075  if (oya->level < fxd.depth_limits[attr]) {
1076  /* Dimension is divisible. */
1077  if (oya->cen[attr] < obj->value[attr]) {
1078  BITSET(path, attr);
1079  }
1080  }
1081  }
1082 }
1083 
1084 static void
1085 flip_path(unsigned int *xpath, unsigned int *path, int attr)
1086 {
1087  bits_copy(xpath, path);
1088  BITXOR(xpath, attr);
1089 }
1090 
1091 /* Makes a direct adjacency of a CELL for each direction. The
1092  flipping the bit of the path makes the new path to the near side of
1093  the sibling cells. */
1094 
1095 static void
1096 set_direct_adjacency(struct CELL *cell)
1097 {
1098  unsigned int xpath[fxd.CODE_SIZE];
1099  struct CELL *oya = cell->parent;
1100  if (oya != NULL) {
1101  for (int dir = 0; dir < fxd.DIM_UPDN; dir++) {
1102  assert(cell->adjacents[dir] == NULL);
1103  int attr = (dir / 2);
1104  int updn = BITGET(cell->path, attr);
1105  if ((oya->level < fxd.depth_limits[attr]) && updn != (dir & 1)) {
1106  /* Take the other half of the cell. */
1107  flip_path(xpath, cell->path, attr);
1108  struct CELL *c = find_child(oya, xpath);
1109  if (c != NULL && c->density != SPARSE) {
1110  cell->adjacents[dir] = c;
1111  }
1112  } else {
1113  /* Take the adjacent cell of the parent. */
1114  struct CELL *ac = oya->adjacents[dir];
1115  if (ac != NULL) {
1116  switch (ac->density) {
1117  case MADA:
1118  assert(ac->density != MADA);
1119  break;
1120  case SPARSE:
1121  /*nothing*/
1122  break;
1123  case DENSE:
1124  cell->adjacents[dir] = ac;
1125  break;
1126  case MIDDLE:
1127  if (oya->level < fxd.depth_limits[attr]) {
1128  flip_path(xpath, cell->path, attr);
1129  } else {
1130  bits_copy(xpath, cell->path);
1131  }
1132  struct CELL *c = find_child(ac, xpath);
1133  if (c != NULL && c->density != SPARSE) {
1134  cell->adjacents[dir] = c;
1135  }
1136  break;
1137  case DEBRIS:
1138  assert(ac->density != DEBRIS);
1139  break;
1140  default:
1141  assert(false);
1142  break;
1143  }
1144  }
1145  }
1146  }
1147  }
1148 }
1149 
1150 /* *** PHASE 2 *** */
1151 
1152 /* Creates clusters of the dense cells, and unions clusters among the
1153  adjacent ones. */
1154 
1155 static void
1156 make_clusters(void)
1157 {
1158  int id = 0;
1159  for (struct CELL *p = fxd.dense_cells; p != NULL; p = p->d_next) {
1160  if (p->cluster == NULL) {
1161  struct CLUSTER *cluster = malloc_safe(sizeof(struct CLUSTER));
1162  setup_cluster(cluster, id);
1163  link_cells_to_cluster(cluster, p);
1164  id++;
1165  }
1166  }
1167 }
1168 
1169 static void
1170 link_cells_to_cluster(struct CLUSTER *cluster, struct CELL *cell)
1171 {
1172  assert(cell->cluster == NULL);
1173  assert(cell->density == DENSE);
1174  assert(cell->density != DENSE || cell->objects.count != 0);
1175  put_cell_in_cluster(cluster, cell);
1176  for (int dir = 0; dir < fxd.DIM_UPDN; dir++) {
1177  struct CELL *ac = cell->adjacents[dir];
1178  assert(ac == NULL || (ac->density != MADA && ac->density != DEBRIS));
1179  if (ac == NULL) {
1180  /*nothing*/ ;
1181  } else if (ac->density == SPARSE || ac->density == MIDDLE) {
1182  /*nothing*/ ;
1183  } else if (ac->cluster == NULL) {
1184  link_cells_to_cluster(cluster, ac);
1185  } else {
1186  merge_clusters(cluster, ac->cluster);
1187  }
1188  }
1189 }
1190 
1191 static void
1192 merge_clusters(struct CLUSTER *x0, struct CLUSTER *y0)
1193 {
1194  /* Do nothing if x0 and y0 are the same cluster. */
1195  for (struct CLUSTER *p = x0->n_next; p != x0; p = p->n_next) {
1196  if (p == y0) {
1197  return;
1198  }
1199  }
1200  struct CLUSTER *x1 = x0->n_next;
1201  struct CLUSTER *y1 = y0->n_next;
1202  /* (x1->..->x0->y1->..->y0->) */
1203  x0->n_next = y1;
1204  y0->n_next = x1;
1205 }
1206 
1207 /* Marks representatives of clusters active, and others inactive. It
1208  also counts the number of objects and clusters. Clusters are
1209  active initially. */
1210 
1211 static void
1212 finish_clusters(void)
1213 {
1214  int nclusters = 0;
1215  for (struct CLUSTER *pp = fxd.clusters; pp != NULL; pp = pp->v_next) {
1216  if (!pp->active) {
1217  continue;
1218  }
1219  nclusters++;
1220  int oo = 0;
1221  {
1222  struct CLUSTER *q = pp;
1223  do {
1224  assert(q->active);
1225  if (q != pp) {
1226  q->active = false;
1227  }
1228  for (struct CELL *c = q->cells; c != NULL; c = c->x_next) {
1229  oo += c->objects.count;
1230  }
1231  q = q->n_next;
1232  } while (q != pp);
1233  }
1234  assert(pp->object_count == 0);
1235  pp->object_count = oo;
1236  }
1237  assert(fxd.n_clusters == 0);
1238  fxd.n_clusters = nclusters;
1239 }
1240 
1241 /* *** PRINTERS *** */
1242 
1243 static void
1244 print_depth_limit(void)
1245 {
1246  printf("bottom-levels:");
1247  for (int attr = 0; attr < fxd.DIM; attr++) {
1248  printf(" [%d]=%d", attr, fxd.depth_limits[attr]);
1249  }
1250  printf("\n");
1251 }
1252 
1253 void
1254 print_parameters(struct INPARA *para)
1255 {
1256  printf("input-file: %s\n", para->infile);
1257  printf("parameters:"
1258  " dense-min=%d dense-factor=%.2f bottom-level=%d hash-size=%d\n",
1259  para->dmin , para->dfac, para->nlayers, para->hashsize);
1260 }
1261 
1262 void
1263 print_input(struct CELL *top)
1264 {
1265  printf("-----Input Cell information-----\n");
1266  printf("The number of data objects = %ld\n", top->objects.count);
1267  printf("top->nighbor.dl\n");
1268  printf("(attribute#, min, center, max)\n");
1269  for (int i = 0; i < fxd.DIM; i++) {
1270  printf("(%s%3d, ", (i == 0 ? "" : " "), i);
1271 #if (DATA == DATA_Z)
1272  printf("%4d, ", top->min[i]);
1273  printf("%4d, ", top->cen[i]);
1274  printf("%4d)", top->max[i]);
1275 #else
1276  printf("%f, ", top->min[i]);
1277  printf("%f, ", top->cen[i]);
1278  printf("%f)", top->max[i]);
1279 #endif
1280  }
1281  printf("\n");
1282 }
Definition: flexdice.h:66
Definition: flexdice.h:79