## KMR: wordcountpy.py
## Word Count (2015-06-13)

## This ranks the words by their occurrence count in the "../LICENSE"
## file.  It can be run under MPI as follows:
## $ mpiexec -n 4 python wordcountpy.py
6 
7 from mpi4py import MPI
8 import kmr4py
9 import re
10 
## Text file whose words are counted.  The path is relative, so it is
## resolved against the working directory the script is started from.
file_name = "../LICENSE"

## KMR context used by the callbacks and by the map-reduce steps below.
## NOTE(review): "world" presumably selects MPI_COMM_WORLD -- confirm
## against the kmr4py documentation.
kmr = kmr4py.KMR("world")
14 
def read_words_from_a_file(kv, kvi, kvo, i, *_data):
    """Add a ``(word, 1)`` pair to ``kvo`` for every word in ``file_name``.

    Each line is stripped and split on runs of non-word characters
    (``\\W+``); the empty strings that the split leaves at line edges
    or on blank lines are skipped.

    Parameters:
        kv, kvi, i, *_data: supplied by the caller of this callback;
            not used here.
        kvo: output sink; only its ``add(key, value)`` method is used.

    Raises:
        IOError/OSError: if ``file_name`` cannot be opened.
    """
    ## Compile the separator once instead of looking it up per line.
    split_words = re.compile(r"\W+").split
    ## The with-block closes the file even when splitting or kvo.add()
    ## raises, which the former explicit close() did not guarantee.
    with open(file_name, "r") as file_:
        for line in file_:
            for w in split_words(line.strip()):
                if w:
                    kvo.add(w, 1)
23 
def print_top_five(kv, kvi, kvo, i, *_data):
    """Print ``#<word>=<count>`` for the first five entries, on rank 0 only.

    Parameters:
        kv: a ``(k, v)`` pair where ``k`` is the negated occurrence
            count and ``v`` is the word.
        kvi, kvo, *_data: supplied by the caller; not used here.
        i: index of this entry; only entries with ``i < 5`` are printed.

    The pair is unpacked in the body, not in the signature, because
    tuple parameters were removed in Python 3 (PEP 3113).  Callers still
    pass the pair as the first positional argument.
    """
    (k, v) = kv
    ## (NO FIELD VALUE IN KMR.MR BECAUSE IT IS A DUMMY).
    if (kmr.rank == 0 and i < 5):
        ## Calling print with one parenthesised argument behaves the
        ## same on Python 2 and Python 3.  The count was stored negated,
        ## so 0 - k restores the positive value.
        print("#%s=%d" % (v, int(0 - k)))
28 
def sum_counts_for_a_word(kvvec, kvi, kvo, *_data):
    """Collapse a list of ``(word, count)`` pairs into one negated total.

    The key of the first pair in ``kvvec`` is used as the output key,
    and the values of all pairs are summed.  The pair
    ``(key, -total)`` is then added to ``kvo``.

    Parameters:
        kvvec: non-empty sequence of ``(key, value)`` pairs.
        kvi, *_data: supplied by the caller; not used here.
        kvo: output sink; only its ``add(key, value)`` method is used.
    """
    word, _ignored = kvvec[0]
    total = sum(n for (_word, n) in kvvec)
    ## The total is stored negated; print_top_five undoes this with
    ## 0 - k when printing.
    kvo.add(word, -total)
35 
## Only rank 0 prints the banner.  Calling print with one parenthesised
## argument behaves the same on Python 2 and Python 3.
if (kmr.rank == 0):
    print("Ranking words...")

## Pipeline:
##  kvs0: (word, 1) pairs produced by read_words_from_a_file.
##  kvs1: the same pairs after shuffle().
##  kvs2: (word, -count) pairs produced by sum_counts_for_a_word.
##  kvs3: result of reverse(); print_top_five reads its key as the
##        negated count and its value as the word.
##  kvs4: result of sort(); the first five entries are printed.
## NOTE(review): the exact semantics of map_once/shuffle/reverse/sort
## (and of the False argument to map_once) come from kmr4py and are not
## visible here -- confirm against its documentation.
kvs0 = kmr.emptykvs.map_once(False, read_words_from_a_file, key="cstring")
kvs1 = kvs0.shuffle()
kvs2 = kvs1.reduce(sum_counts_for_a_word, key="cstring", value="integer")
kvs3 = kvs2.reverse()
kvs4 = kvs3.sort()
kvs4.map(print_top_five, output=False, nothreading=True)

## Release the KMR context, then finalize the kmr4py module.
kmr.dismiss()
kmr4py.fin()