KMR
kmeanspy.py
1 # K-Means (2015-06-15)
2 
3 ## An example of K-Means implementation.
4 ## It can be run under MPI as follows:
5 ## $ mpiexec -n 4 python kmeanspy.py
6 
7 import random
8 from mpi4py import MPI
9 import kmr4py
10 
class K_Means(object):
    """Configuration and randomly generated data for the K-Means example.

    Every constructor argument defaults to the value this example has
    always used, so ``K_Means()`` behaves exactly as before; pass keyword
    arguments to run a differently sized problem without editing the file.

    Attributes:
        n_iteration: number of K-Means iterations to run.
        grid_size: coordinates are drawn from 0 .. grid_size - 1.
        dim: number of coordinates per point.
        n_points: number of points created by init_points().
        n_means: number of cluster centers created by init_means().
        means: list of cluster centers, None until init_means() is called.
        points: list of points, None until init_points() is called.
    """

    def __init__(self, n_iteration=10, grid_size=1000, dim=3,
                 n_points=10000, n_means=100):
        """Store the problem size; no random data is generated here."""
        self.n_iteration = n_iteration
        self.grid_size = grid_size
        self.dim = dim
        self.n_points = n_points
        self.n_means = n_means
        self.means = None
        self.points = None

    def __str__(self):
        """Return a multi-line, human-readable dump of the configuration."""
        lines = [
            '#### Configuration ###########################',
            'Iteration = %d' % (self.n_iteration),
            'Grid size = %d' % (self.grid_size),
            'Dimension = %d' % (self.dim),
            'Number of clusters = %d' % (self.n_means),
            'Number of points = %d' % (self.n_points),
            '##############################################',
        ]
        return '\n'.join(lines)

    def init_means(self):
        """Replace self.means with n_means freshly drawn random centers."""
        self.means = []
        self._fill_randoms(self.means, self.n_means)

    def init_points(self):
        """Replace self.points with n_points freshly drawn random points."""
        self.points = []
        self._fill_randoms(self.points, self.n_points)

    def _fill_randoms(self, tlst, count):
        """Append `count` random vectors of length self.dim to `tlst`.

        Each coordinate is an integer drawn with random.randint from
        0 .. grid_size - 1 (both ends inclusive).  Coordinates are drawn
        one vector at a time, in order, so results are reproducible for a
        given random seed.
        """
        upper = self.grid_size - 1
        for _ in range(count):
            tlst.append([random.randint(0, upper)
                         for _coord in range(self.dim)])
46 
def calc_sq_dist(v1, v2):
    """Return the squared Euclidean distance between v1 and v2.

    The vectors are paired coordinate by coordinate with zip, so trailing
    coordinates of the longer vector are ignored.  Two empty vectors give 0.
    """
    total = 0
    for (a, b) in zip(v1, v2):
        diff = a - b
        total += diff * diff
    return total
52 
# Emit Key:id of point(integer), Value:a point(list of integer)
def load_points(kv, kvi, kvo, i):
    """Add every point of kmeans.points to kvo, keyed by its list index.

    Only kvo is used; kv, kvi and i are part of the callback signature
    and are discarded immediately.
    """
    del kv, kvi, i
    indexed_points = enumerate(kmeans.points)
    for (point_id, coords) in indexed_points:
        kvo.add(point_id, coords)
58 
# Emit Key:id of nearest group, Value:a point(list of integer)
def calc_cluster(kv, kvi, kvo, i):
    """Assign one point to its nearest cluster center.

    Args:
        kv: a (key, value) pair; the value is the point, the key is ignored.
            (Formerly a Python 2 tuple parameter; it is now unpacked in the
            body so the definition also parses on Python 3.)
        kvi: unused.
        kvo: output; receives (index of the nearest mean, point).
        i: unused.

    Ties are resolved in favour of the mean with the lowest index.  If
    kmeans.means is empty the point is emitted with index 0.
    """
    del kvi, i
    _, point = kv
    nearest_id = 0
    # Start with "no distance seen yet" instead of grid_size ** 2: a squared
    # distance can be as large as dim * (grid_size - 1) ** 2, so with more
    # than one dimension the old bound could be smaller than every real
    # distance, and such points were wrongly assigned to cluster 0.
    nearest_dst = None
    for (idm, mean) in enumerate(kmeans.means):
        dst = calc_sq_dist(point, mean)
        if nearest_dst is None or dst < nearest_dst:
            nearest_id = idm
            nearest_dst = dst
    kvo.add(nearest_id, point)
70 
# Emit nothing
def copy_center(kv, kvi, kvo, i):
    """Store one updated cluster center into kmeans.means.

    Args:
        kv: a (key, value) pair; the key is the index into kmeans.means and
            the value is the new center.  (Formerly a Python 2 tuple
            parameter; it is now unpacked in the body so the definition
            also parses on Python 3.)
        kvi: unused.
        kvo: unused; nothing is emitted.
        i: unused.
    """
    del kvi, kvo, i
    group_id, center = kv
    kmeans.means[group_id] = center
75 
# Emit Key:id of group(integer),
#      Value:coordinates of center of the group(list of integer)
def update_cluster(kvvec, kvi, kvo):
    """Average the points of one group and emit the new center.

    Args:
        kvvec: sequence of (group id, point) pairs; the group id of the
            first pair is used as the output key.
        kvi: unused.
        kvo: output; receives the group id and the averaged coordinates.

    Only the first kmeans.dim coordinates of each point are summed.
    """
    del kvi
    count = len(kvvec)
    sums = [0] * kmeans.dim
    for (_, point) in kvvec:
        for d in range(kmeans.dim):
            sums[d] += point[d]
    # Floor division: for integer coordinates this is what "/" already did
    # under Python 2, and it keeps the centers integral under Python 3,
    # where "/" would produce floats.
    avg = [total // count for total in sums]
    # NOTE(review): the mappers in this file emit with kvo.add(key, value);
    # confirm that add_kv accepts the same (key, value) arguments.
    kvo.add_kv(kvvec[0][0], avg)
88 
89 
#### main
# The print calls below use the single-argument parenthesised form, which
# prints the same text under Python 2 (print statement) and Python 3
# (print function).
comm = MPI.COMM_WORLD
kmr = kmr4py.KMR("world")
kmeans = K_Means()
random.seed(1)

# Only rank 0 draws the initial means; the broadcast hands them to all ranks.
if comm.rank == 0:
    print('Number of processes = %d' % (comm.size))
    print(kmeans)
    kmeans.init_means()
kmeans.means = comm.bcast(kmeans.means, root=0)
# Every rank draws its own n_points points.
# NOTE(review): all ranks seed with 1 and only rank 0 has consumed random
# numbers (for the means) at this point, so ranks other than 0 draw the same
# point set as each other - confirm this is intended.
kmeans.init_points()

for _ in range(0, kmeans.n_iteration):
    # points -> (nearest mean id, point) -> shuffle -> new center per id
    kvs0 = kmr.emptykvs.map_once(False, load_points, key="integer")
    kvs1 = kvs0.map(calc_cluster, key="integer")
    kvs2 = kvs1.shuffle()
    kvs3 = kvs2.reduce(update_cluster, key="integer")
    # Replicate the new centers, then copy them into kmeans.means.
    kvs4 = kvs3.replicate()
    kvs4.map(copy_center)

    if comm.rank == 0:
        print('Cluster coordinates')
        for m in kmeans.means:
            print(m)

kmr.dismiss()
kmr4py.fin()
def _fill_randoms(self, tlst, count)
Definition: kmeanspy.py:40