참조
[1] : https://github.com/jhofman/icwsm2010_tutorial/blob/master/hstream.py
[2] : http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python
[3] : http://jakehofman.com/icwsm2010
파이썬으로 Hadoop Streaming을 편하게 할 수 있는 파이썬 클래스 소개
# 실행
# Submit the job through the Hadoop Streaming jar: wordcount.py and hstream.py
# are attached with -file, wordcount.py is used as the mapper (-m) and as the
# reducer (-r), input is read from input_data and written to output_data.
# The ".../" prefixes are placeholders for the real script directory.
./bin/hadoop jar contrib/streaming/hadoop-0.20.2-streaming.jar \
-file .../wordcount.py \
-file .../hstream.py \
-mapper '.../wordcount.py -m' \
-reducer '.../wordcount.py -r' \
-input input_data \
-output output_data
# wordcount.py
#!/usr/bin/env python
from hstream import HStream
import sys
import re
from collections import defaultdict
class WordCount(HStream):
    """Word-count job built on HStream.

    The mapper emits a ``(word, 1)`` pair for every whitespace-separated
    token, and the reducer sums the counts it receives for one key.
    """

    def mapper(self, record):
        """Emit ``(word, 1)`` for each whitespace-separated token in *record*.

        *record* is joined with single spaces before splitting, so it must be
        an iterable of strings (presumably the fields of one input line --
        confirm against hstream.py).
        """
        for word in " ".join(record).split():
            self.write_output((word, 1))

    def reducer(self, key, records):
        """Sum the counts in *records* and emit a single ``(word, total)``.

        Each record must unpack into ``(word, count)``; ``count`` is converted
        with ``int()`` before being added to the running total.
        """
        # Seed the output word from the key so an empty group emits
        # (key, 0) instead of raising NameError on an unbound ``word``.
        # Assumes key is the word itself -- confirm against hstream.py.
        word = key
        total = 0
        for record in records:
            word, count = record
            total += int(count)
        self.write_output((word, total))
# Script entry point: build the job only when executed directly, not on import.
# Presumably the HStream constructor reads the -m / -r flag and runs that
# phase -- confirm against hstream.py.
if __name__ == '__main__':
    WordCount()
댓글 없음:
댓글 쓰기