java - Hadoop Word count - Compare multiple text files' contents with a list of words -

I am writing a program using Hadoop MapReduce.

The objective of the program is to read ten text files and detect whether their content contains any of the top 100 most common words. Each time one of those words is found, the counter for that word is increased by 1.

The final output needs to look like this:

file 1: 1,2,3,4,... (where 1 is the frequency of the first word in the top-100 common word list, 2 is the frequency of the second word in that list, and so on)

file 2: 5,6,10,9,... (where 5 is the frequency of the first word in the top-100 common word list, 6 is the frequency of the second word in that list, and so on)

etc

Below is the complete code. However, the output result is 0.

What is wrong? Can anyone help me?

many thanks

import java.io.ioexception; import java.util.arrays; import java.util.hashmap; import java.util.list; import java.util.stringtokenizer;  import org.apache.hadoop.conf.configuration; import org.apache.hadoop.fs.path; import org.apache.hadoop.io.intwritable; import org.apache.hadoop.io.longwritable; import org.apache.hadoop.io.text; import org.apache.hadoop.mapreduce.job; import org.apache.hadoop.mapreduce.mapper; import org.apache.hadoop.mapreduce.reducer; import org.apache.hadoop.mapreduce.lib.input.fileinputformat; import org.apache.hadoop.mapreduce.lib.input.filesplit; import org.apache.hadoop.mapreduce.lib.output.fileoutputformat;  public class wordcount   {     public static class tokenizermapper    extends mapper<object, text, text, intwritable>     {      private final static intwritable 1 = new intwritable(1);     private text word = new text();       public void map(longwritable key , text value, context context) throws      ioexception, interruptedexception {      filesplit split = (filesplit) context.getinputsplit();     string filename = split.getpath().getname().tostring();      string[] top100word = { "the", "be", "to", "of", "and", "a", "in",      "that", "have", "i", "it", "for", "not", "on", "with", "he", "as",      "you", "do", "at", "this", "but", "his", "by", "from", "they", "we",      "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",      "there", "their", "what", "so", "up", "out", "if", "about", "who",      "get", "which", "go", "me", "when", "make", "can", "like", "time", "no",      "just", "him", "know", "take", "people", "into", "year", "your", "good",      "some", "could", "them", "see", "other", "than", "then", "now", "look",      "only", "come", "its", "over", "think", "also", "back", "after", "use",      "two", "how", "our", "work", "first", "well", "way", "even", "new",      "want", "because", "any", "these", "give", "day", "most", "us" };     list<string> stopwordlist=arrays.aslist(top100word);     string s = 
value.tostring();      (string word : s.split("\\w+"))      {                    if ((word.length() > 0)&&(stopwordlist.equals(word)))          {            context.write(new text(filename+word), new intwritable(1));         }            }      }    }      public static class intsumreducer     extends reducer<text,intwritable,text,intwritable> {     private intwritable result = new intwritable();      public void reduce(text key, iterable<intwritable> values,               context context               ) throws ioexception, interruptedexception {     int sum = 0;     (intwritable val : values) {     sum += val.get();     }     result.set(sum);     context.write(key, result);    }    }  public static void main(string[] args) throws exception { configuration conf = new configuration(); job job = job.getinstance(conf, "word count"); job.setjarbyclass(wordcount.class); job.setmapperclass(tokenizermapper.class); job.setcombinerclass(intsumreducer.class); job.setreducerclass(intsumreducer.class); job.setoutputkeyclass(text.class); job.setoutputvalueclass(intwritable.class); fileinputformat.addinputpath(job, new path(args[0])); fileoutputformat.setoutputpath(job, new path(args[1])); system.exit(job.waitforcompletion(true) ? 0 : 1);  }  }

Search This Blog

New Generation Education

java - Hadoop Word count - Compare multiple text files' contents with a list of words -

Comments

Post a Comment

Popular posts from this blog

php - Permission denied. Laravel linux server -

google bigquery - Delta between query execution time and Java query call to finish -

python - Pandas two dataframes multiplication? -