java - Hadoop word count - Compare multiple text files' contents with a list of words
I am writing a program using Hadoop MapReduce.
The objective of the program is to read ten text files and detect whether their content contains any of the top 100 most common English words. Whenever one of those words occurs, its counter increases by 1.
The final output needs to look like the following:
file 1: 1,2,3,4,... (where 1 is the frequency of the first word in the top-100 common word list, 2 is the frequency of the second word in the list, ...)
file 2: 5,6,10,9,... (where 5 is the frequency of the first word in the top-100 common word list, 6 is the frequency of the second word in the list, ...)
etc.
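For context: a job keyed on filename + word emits one output line per (file, word) pair, so producing the comma-separated vectors described above takes a small post-processing step. Below is a minimal sketch of one, assuming the job's plain-text output and a "#" separator between filename and word in the key; the separator and the class name FormatOutput are illustrative assumptions, and the word list is abbreviated for space.

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.StringJoiner;
import java.util.TreeMap;

public class FormatOutput {
    public static void main(String[] args) throws Exception {
        // Abbreviated here; use the same 100-word array as the job itself
        List<String> top100Word = Arrays.asList("the", "be", "to", "of", "and");

        // filename -> frequency vector, one slot per word in top100Word
        Map<String, int[]> countsPerFile = new TreeMap<>();

        // args[0]: a part-r-* output file of the job, lines like "file1.txt#the<TAB>12"
        try (BufferedReader in = new BufferedReader(new FileReader(args[0]))) {
            String line;
            while ((line = in.readLine()) != null) {
                String[] kv = line.split("\t");   // key TAB count
                String[] fw = kv[0].split("#");   // filename "#" word
                int idx = top100Word.indexOf(fw[1]);
                if (idx < 0) continue;            // not a top-100 word
                countsPerFile
                        .computeIfAbsent(fw[0], f -> new int[top100Word.size()])[idx]
                        += Integer.parseInt(kv[1]);
            }
        }

        // Print "file: c1,c2,c3,..." in the order of top100Word
        for (Map.Entry<String, int[]> e : countsPerFile.entrySet()) {
            StringJoiner sj = new StringJoiner(",");
            for (int c : e.getValue()) sj.add(String.valueOf(c));
            System.out.println(e.getKey() + ": " + sj);
        }
    }
}

It would be run over each reducer output file, e.g. java FormatOutput part-r-00000.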
The complete code is below. Yet, the output result is 0 for everything.
Why is it wrong? Can anyone help me?
Many thanks.
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Name of the file this split belongs to, used as part of the output key
            FileSplit split = (FileSplit) context.getInputSplit();
            String filename = split.getPath().getName();

            String[] top100Word = { "the", "be", "to", "of", "and", "a", "in", "that", "have",
                    "i", "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
                    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
                    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
                    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
                    "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
                    "people", "into", "year", "your", "good", "some", "could", "them", "see",
                    "other", "than", "then", "now", "look", "only", "come", "its", "over",
                    "think", "also", "back", "after", "use", "two", "how", "our", "work",
                    "first", "well", "way", "even", "new", "want", "because", "any", "these",
                    "give", "day", "most", "us" };
            List<String> stopWordList = Arrays.asList(top100Word);

            String s = value.toString();
            for (String word : s.split("\\W+")) {
                if ((word.length() > 0) && (stopWordList.equals(word))) {
                    context.write(new Text(filename + word), new IntWritable(1));
                }
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum all counts emitted for this (filename + word) key
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
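The zero counts almost certainly come from stopWordList.equals(word): that compares the whole List<String> against a single String, which is never true, so the mapper never emits anything; List.contains(word) is presumably what was intended. Two smaller issues compound it: input tokens keep their original capitalization while the top-100 list is lower-case, and the key filename + word has no separator, so the two parts cannot be told apart later. A minimal corrected mapper sketch follows; it assumes the same imports and word array as the posted code, and the "#" separator in the key is an illustrative choice, not anything Hadoop requires.

    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            String filename = split.getPath().getName();

            // Same 100-word array as above, abbreviated here
            List<String> stopWordList = Arrays.asList("the", "be", "to" /* ... */);

            for (String token : value.toString().split("\\W+")) {
                String w = token.toLowerCase();                  // match "The" against "the"
                if (!w.isEmpty() && stopWordList.contains(w)) {  // contains, not equals
                    // "#" is an assumed separator so filename and word can be split apart later
                    context.write(new Text(filename + "#" + w), one);
                }
            }
        }
    }

With contains and the lower-casing in place, each matching token yields one (filename#word, 1) pair, which IntSumReducer then sums per key.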