diff --git a/%3E %3Cinput%3E %3CintermediateDir%3E %3CoutDir%3E b/%3E %3Cinput%3E %3CintermediateDir%3E %3CoutDir%3E new file mode 100644 index 0000000..84d951f --- /dev/null +++ b/%3E %3Cinput%3E %3CintermediateDir%3E %3CoutDir%3E @@ -0,0 +1,231 @@ + + SSUUMMMMAARRYY OOFF LLEESSSS CCOOMMMMAANNDDSS + + Commands marked with * may be preceded by a number, _N. + Notes in parentheses indicate the behavior if _N is given. + + h H Display this help. + q :q Q :Q ZZ Exit. + --------------------------------------------------------------------------- + + MMOOVVIINNGG + + e ^E j ^N CR * Forward one line (or _N lines). + y ^Y k ^K ^P * Backward one line (or _N lines). + f ^F ^V SPACE * Forward one window (or _N lines). + b ^B ESC-v * Backward one window (or _N lines). + z * Forward one window (and set window to _N). + w * Backward one window (and set window to _N). + ESC-SPACE * Forward one window, but don't stop at end-of-file. + d ^D * Forward one half-window (and set half-window to _N). + u ^U * Backward one half-window (and set half-window to _N). + ESC-) RightArrow * Left one half screen width (or _N positions). + ESC-( LeftArrow * Right one half screen width (or _N positions). + F Forward forever; like "tail -f". + r ^R ^L Repaint screen. + R Repaint screen, discarding buffered input. + --------------------------------------------------- + Default "window" is the screen height. + Default "half-window" is half of the screen height. + --------------------------------------------------------------------------- + + SSEEAARRCCHHIINNGG + + /_p_a_t_t_e_r_n * Search forward for (_N-th) matching line. + ?_p_a_t_t_e_r_n * Search backward for (_N-th) matching line. + n * Repeat previous search (for _N-th occurrence). + N * Repeat previous search in reverse direction. + ESC-n * Repeat previous search, spanning files. + ESC-N * Repeat previous search, reverse dir. & spanning files. + ESC-u Undo (toggle) search highlighting. 
+ &_p_a_t_t_e_r_n * Display only matching lines + --------------------------------------------------- + Search patterns may be modified by one or more of: + ^N or ! Search for NON-matching lines. + ^E or * Search multiple files (pass thru END OF FILE). + ^F or @ Start search at FIRST file (for /) or last file (for ?). + ^K Highlight matches, but don't move (KEEP position). + ^R Don't use REGULAR EXPRESSIONS. + --------------------------------------------------------------------------- + + JJUUMMPPIINNGG + + g < ESC-< * Go to first line in file (or line _N). + G > ESC-> * Go to last line in file (or line _N). + p % * Go to beginning of file (or _N percent into file). + t * Go to the (_N-th) next tag. + T * Go to the (_N-th) previous tag. + { ( [ * Find close bracket } ) ]. + } ) ] * Find open bracket { ( [. + ESC-^F _<_c_1_> _<_c_2_> * Find close bracket _<_c_2_>. + ESC-^B _<_c_1_> _<_c_2_> * Find open bracket _<_c_1_> + --------------------------------------------------- + Each "find close bracket" command goes forward to the close bracket + matching the (_N-th) open bracket in the top line. + Each "find open bracket" command goes backward to the open bracket + matching the (_N-th) close bracket in the bottom line. + + m_<_l_e_t_t_e_r_> Mark the current position with . + '_<_l_e_t_t_e_r_> Go to a previously marked position. + '' Go to the previous position. + ^X^X Same as '. + --------------------------------------------------- + A mark is any upper-case or lower-case letter. + Certain marks are predefined: + ^ means beginning of the file + $ means end of the file + --------------------------------------------------------------------------- + + CCHHAANNGGIINNGG FFIILLEESS + + :e [_f_i_l_e] Examine a new file. + ^X^V Same as :e. + :n * Examine the (_N-th) next file from the command line. + :p * Examine the (_N-th) previous file from the command line. + :x * Examine the first (or _N-th) file from the command line. 
+ :d Delete the current file from the command line list. + = ^G :f Print current file name. + --------------------------------------------------------------------------- + + MMIISSCCEELLLLAANNEEOOUUSS CCOOMMMMAANNDDSS + + -_<_f_l_a_g_> Toggle a command line option [see OPTIONS below]. + --_<_n_a_m_e_> Toggle a command line option, by name. + __<_f_l_a_g_> Display the setting of a command line option. + ___<_n_a_m_e_> Display the setting of an option, by name. + +_c_m_d Execute the less cmd each time a new file is examined. + + !_c_o_m_m_a_n_d Execute the shell command with $SHELL. + |XX_c_o_m_m_a_n_d Pipe file between current pos & mark XX to shell command. + v Edit the current file with $VISUAL or $EDITOR. + V Print version number of "less". + --------------------------------------------------------------------------- + + OOPPTTIIOONNSS + + Most options may be changed either on the command line, + or from within less by using the - or -- command. + Options may be given in one of two forms: either a single + character preceded by a -, or a name preceeded by --. + + -? ........ --help + Display help (from command line). + -a ........ --search-skip-screen + Forward search skips current screen. + -A ........ --SEARCH-SKIP-SCREEN + Forward search always skips target line. + -b [_N] .... --buffers=[_N] + Number of buffers. + -B ........ --auto-buffers + Don't automatically allocate buffers for pipes. + -c ........ --clear-screen + Repaint by clearing rather than scrolling. + -d ........ --dumb + Dumb terminal. + -D [_x_n_._n] . --color=_x_n_._n + Set screen colors. (MS-DOS only) + -e -E .... --quit-at-eof --QUIT-AT-EOF + Quit at end of file. + -f ........ --force + Force open non-regular files. + -F ........ --quit-if-one-screen + Quit if entire file fits on first screen. + -g ........ --hilite-search + Highlight only last match for searches. + -G ........ --HILITE-SEARCH + Don't highlight any matches for searches. + -h [_N] .... 
--max-back-scroll=[_N] + Backward scroll limit. + -i ........ --ignore-case + Ignore case in searches that do not contain uppercase. + -I ........ --IGNORE-CASE + Ignore case in all searches. + -j [_N] .... --jump-target=[_N] + Screen position of target lines. + -J ........ --status-column + Display a status column at left edge of screen. + -k [_f_i_l_e] . --lesskey-file=[_f_i_l_e] + Use a lesskey file. + -K --quit-on-intr + Exit less in response to ctrl-C. + -L ........ --no-lessopen + Ignore the LESSOPEN environment variable. + -m -M .... --long-prompt --LONG-PROMPT + Set prompt style. + -n -N .... --line-numbers --LINE-NUMBERS + Don't use line numbers. + -o [_f_i_l_e] . --log-file=[_f_i_l_e] + Copy to log file (standard input only). + -O [_f_i_l_e] . --LOG-FILE=[_f_i_l_e] + Copy to log file (unconditionally overwrite). + -p [_p_a_t_t_e_r_n] --pattern=[_p_a_t_t_e_r_n] + Start at pattern (from command line). + -P [_p_r_o_m_p_t] --prompt=[_p_r_o_m_p_t] + Define new prompt. + -q -Q .... --quiet --QUIET --silent --SILENT + Quiet the terminal bell. + -r -R .... --raw-control-chars --RAW-CONTROL-CHARS + Output "raw" control characters. + -s ........ --squeeze-blank-lines + Squeeze multiple blank lines. + -S ........ --chop-long-lines + Chop long lines. + -t [_t_a_g] .. --tag=[_t_a_g] + Find a tag. + -T [_t_a_g_s_f_i_l_e] --tag-file=[_t_a_g_s_f_i_l_e] + Use an alternate tags file. + -u -U .... --underline-special --UNDERLINE-SPECIAL + Change handling of backspaces. + -V ........ --version + Display the version number of "less". + -w ........ --hilite-unread + Highlight first new line after forward-screen. + -W ........ --HILITE-UNREAD + Highlight first new line after any forward movement. + -x [_N[,...]] --tabs=[_N[,...]] + Set tab stops. + -X ........ --no-init + Don't use termcap init/deinit strings. + --no-keypad + Don't use termcap keypad init/deinit strings. + -y [_N] .... --max-forw-scroll=[_N] + Forward scroll limit. + -z [_N] .... 
--window=[_N] + Set size of window. + -" [_c[_c]] . --quotes=[_c[_c]] + Set shell quote characters. + -~ ........ --tilde + Don't display tildes after end of file. + -# [_N] .... --shift=[_N] + Horizontal scroll amount (0 = one half screen width) + ........ --no-keypad + Don't send keypad init/deinit sequence. + ........ --follow-name + The F command changes files if the input file is renamed. + + + --------------------------------------------------------------------------- + + LLIINNEE EEDDIITTIINNGG + + These keys can be used to edit text being entered + on the "command line" at the bottom of the screen. + + RightArrow ESC-l Move cursor right one character. + LeftArrow ESC-h Move cursor left one character. + CNTL-RightArrow ESC-RightArrow ESC-w Move cursor right one word. + CNTL-LeftArrow ESC-LeftArrow ESC-b Move cursor left one word. + HOME ESC-0 Move cursor to start of line. + END ESC-$ Move cursor to end of line. + BACKSPACE Delete char to left of cursor. + DELETE ESC-x Delete char under cursor. + CNTL-BACKSPACE ESC-BACKSPACE Delete word to left of cursor. + CNTL-DELETE ESC-DELETE ESC-X Delete word under cursor. + CNTL-U ESC (MS-DOS only) Delete entire line. + UpArrow ESC-k Retrieve previous command line. + DownArrow ESC-j Retrieve next command line. + TAB Complete filename & cycle. + SHIFT-TAB ESC-TAB Complete filename & reverse cycle. + CNTL-L Complete filename, list all. + + diff --git a/DoublePair.java b/DoublePair.java new file mode 100644 index 0000000..b1fb494 --- /dev/null +++ b/DoublePair.java @@ -0,0 +1,89 @@ +/* + * CS 61C Fall 2013 Project 1 + * + * DoublePair.java is a class which stores two doubles and + * implements the Writable interface. It can be used as a + * custom value for Hadoop. To use this as a key, you can + * choose to implement the WritableComparable interface, + * although that is not necessary for credit. 
+ */ + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; + +public class DoublePair implements Writable { + // Declare any variables here + public double double1; + public double double2; + /** + * Constructs a DoublePair with both doubles set to zero. + */ + public DoublePair() { + // YOUR CODE HERE + this.double1=0; + this.double2=0; + } + + /** + * Constructs a DoublePair containing double1 and double2. + */ + public DoublePair(double double1, double double2) { + // YOUR CODE HERE + this.double1=double1; + this.double2=double2; + } + + /** + * Returns the value of the first double. + */ + public double getDouble1() { + // YOUR CODE HERE + return this.double1; + + } + + /** + * Returns the value of the second double. + */ + public double getDouble2() { + // YOUR CODE HERE + return this.double2; + } + + /** + * Sets the first double to val. + */ + public void setDouble1(double val) { + // YOUR CODE HERE + this.double1=val; + } + + /** + * Sets the second double to val. + */ + public void setDouble2(double val) { + // YOUR CODE HERE + this.double2=val; + } + + /** + * write() is required for implementing Writable. + */ + public void write(DataOutput out) throws IOException { + // YOUR CODE HERE + out.writeDouble(double1); + out.writeDouble(double2); + } + + /** + * readFields() is required for implementing Writable. + */ + public void readFields(DataInput in) throws IOException { + // YOUR CODE HERE + double1=in.readDouble(); + double2=in.readDouble(); + } +} diff --git a/Func.java b/Func.java new file mode 100644 index 0000000..2bfd38e --- /dev/null +++ b/Func.java @@ -0,0 +1,5 @@ + +/** A class representing a function from doubles to doubles. */ +public abstract class Func { + public abstract double f(double d); +} diff --git a/Importer.java b/Importer.java new file mode 100644 index 0000000..744a32e --- /dev/null +++ b/Importer.java @@ -0,0 +1,183 @@ +/* Written by Ariel Rabkin , 2011. 
+ * Copyright 2011, the Regents of the University of California. + + Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +import java.io.*; +import java.security.MessageDigest; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.*; +import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream; + +/** + * Converts text files to sequence files, suitably for cs61c project 1, Spring 201 + * + * Usage: Importer [output] + * If invoked on a text file, converts that file to compressed sequence file, writing the output + * in output dir. 
+ * If invoked on a directory, recursively scans that directory and subdirs for .txt + * files, storing output to output dir. + * + * Each input file is split at boundaries, where a boundary is a line containing + * exactly the text: "---END.OF.DOCUMENT---" + * + * Will also process .bz2 files, first decompressing them. + * + * Default output dir is "convertedOut" + * + * Written by Ariel Rabkin, asrabkin@gmail.com + * Licensed under the terms of the New BSD License. + * + */ +public class Importer { + + static SequenceFile.Writer seqFileWriter; + static long totalBytes = 0; + static long totalRecords = 0; + static long files = 0; + static File outDir = new File("convertedOut"); + public static void main(String[] args) { + try { + if(args.length < 1) { + System.err.println("can't run. Not enough args. Need to specify input file or dir"); + System.exit(-1); + } else + System.out.println("starting scan of " + args[0]); + + if(args.length > 1) + outDir = new File(args[1]); + System.out.println("dumping output to " + outDir.getAbsolutePath()); + + lookForFiles(new File(args[0])); + long avgRecLength = totalBytes / totalRecords; + System.out.println("total data, uncompressed was " + totalBytes/ (1024 * 1024) + " MB"); + System.out.println("total of " + totalRecords + " records. 
(Avg uncompressed size " + avgRecLength + " bytes)"); + } catch(Exception e) { + e.printStackTrace(); + } + } + + public static Text hash(Text content) throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("post_"); + + MessageDigest md = MessageDigest.getInstance("MD5"); + + md.update(content.getBytes(), 0, content.getLength()); + byte[] bytes = md.digest(); + for(int i=0; i < bytes.length; ++i) { + if( (bytes[i] & 0xF0) == 0) + sb.append('0'); + sb.append( Integer.toHexString(0xFF & bytes[i]) ); + } + return new Text(sb.toString()); + } + + static void lookForFiles(File file) throws Exception { + if(file.isDirectory()) { + File[] contents = file.listFiles(); + if(contents == null) { + System.out.println("WARN: null list of contents for " + file.getAbsolutePath()); + return; + } + for(File sub: contents) + lookForFiles(sub); + } else { + if(file.getName().endsWith(".bz2") || file.getName().contains(".txt")) + copyFile(file); + } + } + + public static void copyFile(File file) throws Exception { +// String TEST_PREFIX = ""; + File destFile = new File(outDir,file.getName()+".seq"); + Path dest = new Path(destFile.getAbsolutePath()); + + Configuration conf = new Configuration(); + FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")),conf); + CompressionCodec codec = new DefaultCodec(); + fileSys.mkdirs(dest.getParent()); + FSDataOutputStream outputStr = fileSys.create(dest); + seqFileWriter = SequenceFile.createWriter(conf, outputStr, + Text.class, Text.class, + SequenceFile.CompressionType.BLOCK, codec); + String filename = file.getName(); + InputStream in = new BufferedInputStream(new FileInputStream(file)); + if(filename.endsWith(".bz2")) { + in.read(); + in.read(); //snarf header + in = new CBZip2InputStream(in); + } + BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII")); + + System.out.println("working on file " + file); + int records = 0; + long bytes = 0, 
bytes_since_status = 0; + long startTime= System.currentTimeMillis(); + String s = null; + Text content = new Text(); + while( (s = br.readLine()) != null) { + if(s.startsWith("---END.OF.DOCUMENT---")) { + Text name = new Text(hash(content)); + seqFileWriter.append(name, content); + records ++; + content = new Text(); + } else { + byte[] line_as_bytes = (s+ " ").getBytes(); + for(byte b: line_as_bytes) { + assert b < 128: "found an unexpected high-bit set"; + } + + content.append(line_as_bytes, 0, line_as_bytes.length); + bytes += line_as_bytes.length; + /* + bytes_since_status += line_as_bytes.length; + if(bytes_since_status > 10 * 1024 * 1024) { //every 10 MB + System.err.print('.'); + bytes_since_status = 0; + }*/ + } + } //end while + if(content.getLength() > 5) { + Text name = new Text(hash(content)); + seqFileWriter.append(name, content); + records ++; + } + totalBytes += bytes; + totalRecords += records; + long time = (System.currentTimeMillis() - startTime)/ 1000 + 1; + long kbSec = bytes / 1024 / time; + System.out.println(new java.util.Date()); + System.out.println("File " + file.getName() + " " + records+ " records, " + + bytes + " bytes in " + time+ " seconds (" +kbSec + " KB/sec)."); + in.close(); + seqFileWriter.close(); + outputStr.close(); + } +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..20683a0 --- /dev/null +++ b/Makefile @@ -0,0 +1,49 @@ +# Build file for CS61C Project 1 [Fall 2013] +# You should not need to edit this file if you're working on the inst machines. + +# This file requires GNU make and depends on paths on instruction machines. + +# If you are working on your own machine, you will need to edit the paths. + + +#### + +## Variables + +# Source files (java code). wildcard selects all files matching a pattern. +SOURCES = $(wildcard *.java) +# Output JAR file +TARGET = proj1.jar +# Extra JARs to have on the classpath when compiling. 
+CLASSPATH = /home/ff/cs61c/hadoop/hadoop-core.jar:/home/ff/cs61c/hadoop/lib/commons-cli.jar
+# Compatibility flags to build for Java 6. Remove these flags if in the future
+# the EC2 servers support Java 7 (or later versions)
+COMPAT_FLAGS = -source 6 -target 6
+# javac command to use
+JAVAC = javac -g $(COMPAT_FLAGS) -deprecation -cp $(CLASSPATH)
+# jar command to use
+JAR = jar
+
+## Make targets
+
+# General form is target: dependencies (targets or files), followed by
+# commands to run to build the target from the dependencies.
+
+# Default target.
+all: $(TARGET)
+
+# The classes directory is an order-only prerequisite (listed after the |):
+# it must exist before compiling, but its timestamp, which changes whenever
+# a file is added to it, must not by itself force the jar to be rebuilt.
+$(TARGET): $(SOURCES) | classes
+	$(JAVAC) -d classes $(SOURCES)
+	$(JAR) cf $(TARGET) -C classes .
+
+# -p makes directory creation succeed even if the directory already exists.
+classes:
+	mkdir -p classes
+
+clean:
+	rm -rf classes $(TARGET)
+
+doublepair: classes
+	$(JAVAC) -d classes DoublePair.java
+	java -cp $(CLASSPATH):classes DoublePair
+
+# doublepair names no file, so it is declared phony like all and clean.
+.PHONY: all clean doublepair
diff --git a/Proj1.java b/Proj1.java
new file mode 100644
index 0000000..20f9009
--- /dev/null
+++ b/Proj1.java
@@ -0,0 +1,333 @@
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.lang.Math;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericOptionsParser;
+
+/*
+ * This is the skeleton for CS61c project 1, Fall 2013.
+ * + * Reminder: DO NOT SHARE CODE OR ALLOW ANOTHER STUDENT TO READ YOURS. + * EVEN FOR DEBUGGING. THIS MEANS YOU. + * + */ +public class Proj1{ + + /* + * Inputs is a set of (docID, document contents) pairs. + */ + public static class Map1 extends Mapper { + /** Regex pattern to find words (alphanumeric + _). */ + final static Pattern WORD_PATTERN = Pattern.compile("\\w+"); + + private String targetGram = null; + private int funcNum = 0; + + /* + * Setup gets called exactly once for each mapper, before map() gets called the first time. + * It's a good place to do configuration or setup that can be shared across many calls to map + */ + @Override + public void setup(Context context) { + targetGram = context.getConfiguration().get("targetWord").toLowerCase(); + try { + funcNum = Integer.parseInt(context.getConfiguration().get("funcNum")); + } catch (NumberFormatException e) { + /* Do nothing. */ + } + } + + @Override + public void map(WritableComparable docID, Text docContents, Context context) + throws IOException, InterruptedException { + Matcher matcher = WORD_PATTERN.matcher(docContents.toString()); + Func func = funcFromNum(funcNum); + + // YOUR CODE HERE + ArrayList targetWordIndices = targetWordIndex(docContents); + int counter = 0; + while (matcher.find()) { + String word = matcher.group().toLowerCase(); //the input word + Text wordOutput = new Text(word); + if (!word.equals(targetGram)) { //output word and distance if word is not targetGram + context.write(wordOutput, new DoublePair(1, func.f(distance(targetWordIndices, counter)))); + } + counter++; + + + } + } + + /* + * Finds the closest distance from word to targetGram. 
+ */ + private double distance(ArrayList targetIndices, double currentIndex) { + double minDistance = Double.POSITIVE_INFINITY; + for (int i=0; i targetWordIndex(Text docContents) { + Matcher matcher = WORD_PATTERN.matcher(docContents.toString()); + ArrayList targetIndices = new ArrayList(); + double counter = 0; + while (matcher.find()) { + String word = matcher.group().toLowerCase(); + if (word.equals(targetGram)) { + targetIndices.add(counter); + } + counter++; + } + return targetIndices; + } + + /** Returns the Func corresponding to FUNCNUM*/ + private Func funcFromNum(int funcNum) { + Func func = null; + switch (funcNum) { + case 0: + func = new Func() { + public double f(double d) { + return d == Double.POSITIVE_INFINITY ? 0.0 : 1.0; + } + }; + break; + case 1: + func = new Func() { + public double f(double d) { + return d == Double.POSITIVE_INFINITY ? 0.0 : 1.0 + 1.0 / d; + } + }; + break; + case 2: + func = new Func() { + public double f(double d) { + return d == Double.POSITIVE_INFINITY ? 0.0 : 1.0 + Math.sqrt(d); + } + }; + break; + } + return func; + } + } + + /** Here's where you'll be implementing your combiner. It must be non-trivial for you to receive credit. 
*/ + public static class Combine1 extends Reducer { + + @Override + public void reduce(Text key, Iterable values, + Context context) throws IOException, InterruptedException { + + // YOUR CODE HERE + double instances = 0; + double score=0; + for (DoublePair value: values) { + instances +=value.getDouble1(); + score += value.getDouble2(); + } + context.write(key, new DoublePair(instances, score)); + + + } + } + + + public static class Reduce1 extends Reducer { + @Override + public void reduce(Text key, Iterable values, + Context context) throws IOException, InterruptedException { + + // YOUR CODE HERE + double instances = 0; + double scores = 0; + for (DoublePair value : values) { + instances +=value.getDouble1(); + scores += value.getDouble2(); + } + context.write(key, new DoublePair(instances, scores)); + } + } + + public static class Map2 extends Mapper { + //maybe do something, maybe don't + @Override + public void map(Text key, DoublePair value, Context context) + throws IOException, InterruptedException { + double cRate=0; + double a = value.getDouble1(); + double s = value.getDouble2(); + if (s != 0) { + cRate = (-1)*s*(Math.pow(Math.log(s), 3))/a; //makes scores negative to output in correct order + } + context.write(new DoubleWritable(cRate), key); + + } + } + + public static class Reduce2 extends Reducer { + + int n = 0; + static int N_TO_OUTPUT = 100; + + /* + * Setup gets called exactly once for each reducer, before reduce() gets called the first time. + * It's a good place to do configuration or setup that can be shared across many calls to reduce + */ + @Override + protected void setup(Context c) { + n = 0; + } + + /* + * Your output should be a in the form of (DoubleWritable score, Text word) + * where score is the co-occurrence value for the word. Your output should be + * sorted from largest co-occurrence to smallest co-occurrence. 
+ */ + @Override + public void reduce(DoubleWritable key, Iterable values, + Context context) throws IOException, InterruptedException { + + // YOUR CODE HERE + for (Text word : values) { + if (n targetIndices = new ArrayList(); + targetIndices.add(2.00); + targetIndices.add(4.00); + System.out.println("Test 1 Distance....."); + System.out.println(distance(targetIndices, 1.00)); + System.out.println("Should be 1"); + System.out.println(distance(targetIndices, 7)); + System.out.println("Should be 3"); + ArrayList targetIndices2 = new ArrayList(); + targetIndices2.add(2.00); + targetIndices2.add(5.00); + System.out.println(distance(targetIndices2, 3)); + System.out.println("Should be 1.0"); + System.out.println(distance(targetIndices2, 4)); + System.out.println("Should be 1.0"); + ArrayList targetIndices3 = new ArrayList(); + targetIndices3.add(0.00); + targetIndices3.add(7.00); + System.out.println(distance(targetIndices3, 3)); + System.out.println("Should be 3.0"); + System.out.println(distance(targetIndices3, 4)); + System.out.println("Should be 3.0"); + ArrayList targetIndices4 = new ArrayList(); + targetIndices4.add(2.00); + targetIndices4.add(3.00); + System.out.println("Test 4 Distance....."); + System.out.println(distance(targetIndices4, 4)); + System.out.println("Should be 1.0"); + System.out.println(distance(targetIndices4, 5)); + System.out.println("Should be 2.0"); + } + + + private static double distance(ArrayList targetIndices, double currentIndex) { + double minDistance = Double.POSITIVE_INFINITY; + for (int i=0; i + + + + +targetWord +monument + + + +funcNum +2 + + + diff --git a/ec2experience.txt b/ec2experience.txt new file mode 100644 index 0000000..710d4c5 --- /dev/null +++ b/ec2experience.txt @@ -0,0 +1,73 @@ +1. How long did each of the six runs take? How many mappers and how many reducers did you use? 
+ + run 1: (freedom, 0) on the 2005 dataset with combiner off, 5 workers + time: 16 minutes, 52 seconds + mappers: 242 + reducers: 33 + + run 2: (freedom, 0) on the 2005 dataset with combiner on, 5 workers + time: 6 minutes, 34 seconds + mappers: 242 + reducers: 33 + + run 3: (capital, 0) on the 2006 dataset with combiner on, 5 workers + time: 15 mins, 20 seconds + mappers: 348 + reducers: 33 + processing rate: 0.01938 gb/s + + run 4: (capital, 0) on the 2006 dataset with combiner on, 9 workers + time: 8 mins, 58 seconds + mappers: 348 + reducers: 33 + processing rate: 0.03313 gb/s + + run 5: (landmark, 1) on the 2006 dataset with combiner on, 9 workers + time: 8 mins, 50 seconds + mappers: 348 + reducers: 33 + processing rate: 0.03363 gb/s + + run 6: (monument, 2) on the 2006 dataset with combiner on, 9 workers + time: 8 mins, 51 seconds + mappers: 348 + reducers: 33 + processing rate: 0.03357 gb/s + + +2. For the two runs with (freedom, 0), how much faster did your code run on the 5 workers with the combiner turned on than with the combiner turned off? Express your answer as a percentage. + +((16 minutes 52 seconds)-(6 minutes 34 seconds))/(6 minutes 34 seconds)=1.5685, so 156.85% faster + + +3. For the runs on the 2006 dataset, what was the median processing rate per GB (= 2^30 bytes) of input for the tests using 5 workers? Using 9 workers? + +The median processing rates are shown above in part 1. The median processing rate for 5 workers is 0.01938 gb/s, and the median processing rate for 9 workers is 0.03357 gb/s. + + +4. What was the percent speedup of running (capital, 0) with 9 workers over 5 workers? What is the maximum possible speedup, assuming your code is fully parallelizable? How well, in your opinion, does Hadoop parallelize your code? Justify your answer in 1-2 sentences. 
+ +(0.03313-0.01938)/0.01938 = 0.7095 = 70.95% faster +Optimal is (9-5)/5 = 0.8 = 80% faster + +70.95/80 = 0.8869 = 88.69% efficient + +In my opinion, Hadoop parallelizes code pretty well and it parallelizes at a speedup that is 88.69% of the maximum speedup. + + +5. For a single run on the 2006 dataset, what was the price per GB processed on with 5 workers? With 9 workers? (Recall that an extra-large instance costs $0.58 per hour, rounded up to the nearest hour.) + + ($0.58)*(5 workers)*(1 hour) = $2.90 + GB Processed: (19,139,821,102 bytes)*(1/2^30 gb/bytes) = 17.82534 gb + $2.90/17.82534 gb = + $0.16 per gb + + ($0.58)*(9 workers)*(1 hour) = $5.22 + GB Processed: (19,141,786,065 bytes)*(1/2^30 gb/bytes) = 17.82718 gb + $5.22/17.82718 gb = + $0.29 per gb + + +6. How much total money did you use to complete this project? + +($0.58)*(5 workers)*(1 hour)*(3 jobs) + ($0.58)*(9 workers)*(1 hour)*(3 jobs) = $24.36 \ No newline at end of file diff --git a/landmark-1.txt b/landmark-1.txt new file mode 100644 index 0000000..b64cf24 --- /dev/null +++ b/landmark-1.txt @@ -0,0 +1,20 @@ +100.39530100764568 techese +92.03629080941323 somehiow +69.4081622852701 nixed +69.17064746022587 siiiiiiigh +64.17540253901382 prevenzione +59.92735916055491 journee +55.22177448564794 chiroparctor +54.187496433420826 loizidou +53.230190829283416 undaunting +48.2937701513063 deauthenticizes +42.33005145025361 wunnafulness +39.70436779185503 irala +33.60318835396598 gissi +31.324077477551374 bushido +30.95107412060516 troiseime +30.03294237677179 erhard +28.976262090783774 iyyengar +27.05044811969202 filartiga +26.445904902976714 scammark +25.487072226209133 peqa diff --git a/monument-2.txt b/monument-2.txt new file mode 100644 index 0000000..4455ce7 --- /dev/null +++ b/monument-2.txt @@ -0,0 +1,20 @@ +556.0618844224659 dachaus +443.1566899233314 buchenwalds +432.11171012313133 bandelier +372.51224507027706 ranelagh +291.8976881201187 aftrighted +248.99131468692107 eventsit +220.9959615432032 
2714 +214.8765710028718 belsens +205.89149349373483 waalsdorper +194.2870172417444 3144 +171.9677863857934 94520 +170.84364974523712 pompeys +165.5002825746365 theexisting +143.48903370060955 rubot +139.86197928751682 _inscription +125.66456252541532 restauant +123.73918881704124 80132 +115.99254115323151 gravediggers +107.0751029806899 jennuars +107.0751029806899 26do diff --git a/onetest1/._SUCCESS.crc b/onetest1/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/onetest1/._SUCCESS.crc differ diff --git a/onetest1/.part-r-00000.crc b/onetest1/.part-r-00000.crc new file mode 100644 index 0000000..95fcac4 Binary files /dev/null and b/onetest1/.part-r-00000.crc differ diff --git a/onetest1/_SUCCESS b/onetest1/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/onetest1/part-r-00000 b/onetest1/part-r-00000 new file mode 100644 index 0000000..c5fb96d Binary files /dev/null and b/onetest1/part-r-00000 differ diff --git a/onetest2/._SUCCESS.crc b/onetest2/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/onetest2/._SUCCESS.crc differ diff --git a/onetest2/.part-r-00000.crc b/onetest2/.part-r-00000.crc new file mode 100644 index 0000000..ff30803 Binary files /dev/null and b/onetest2/.part-r-00000.crc differ diff --git a/onetest2/_SUCCESS b/onetest2/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/onetest2/part-r-00000 b/onetest2/part-r-00000 new file mode 100644 index 0000000..cbc9ce1 --- /dev/null +++ b/onetest2/part-r-00000 @@ -0,0 +1,100 @@ +1.7637883358888644 of +1.590984168291892 the +1.3271959397452826 or +0.09998888401272868 abridging +0.09998888401272868 speech +0.04187818243636804 to +0.013888745973675832 thereof +0.007272693876848162 exercise +0.004273494238076528 press +0.002136747119038264 free +0.0018382338216602087 prohibiting +9.523806251581638E-4 religion +5.555554605864092E-4 establishment +4.383080755555035E-4 peaceably +3.5186485067851544E-4 
respecting +2.867383305336562E-4 assemble +2.5990889850540803E-4 right +2.1915403777775176E-4 an +1.977261404917842E-4 make +1.6683349422791388E-4 petition +1.4205553962122824E-4 congress +1.2195121663332598E-4 i +1.2195121663332598E-4 government +1.1111109211728183E-4 people +8.044081457037506E-5 redress +6.274509742066204E-5 grievances +4.7789721755609366E-5 law +4.7348482184300085E-5 no +1.7578090324830298E-5 for +1.69101722086786E-5 and +1.0546854194898178E-5 amendment +9.813734954583169E-6 shall +9.182736306098745E-6 a +0.0 committed +0.0 common +0.0 compelled +0.0 compensation +0.0 compulsory +0.0 confronted +0.0 certain +0.0 consent +0.0 constitution +0.0 construed +0.0 controversy +0.0 counsel +0.0 court +0.0 crime +0.0 criminal +0.0 cruel +0.0 danger +0.0 defence +0.0 delegated +0.0 deny +0.0 deprived +0.0 describing +0.0 disparage +0.0 district +0.0 dollars +0.0 due +0.0 effects +0.0 enjoy +0.0 enumeration +0.0 cause +0.0 examined +0.0 exceed +0.0 except +0.0 excessive +0.0 cases +0.0 fact +0.0 favor +0.0 fines +0.0 case +0.0 forces +0.0 capital +0.0 by +0.0 grand +0.0 but +0.0 have +0.0 held +0.0 him +0.0 himself +0.0 his +0.0 house +0.0 houses +0.0 being +0.0 ii +0.0 iii +0.0 impartial +0.0 imposed +0.0 in +0.0 indictment +0.0 infamous +0.0 inflicted +0.0 informed +0.0 infringed +0.0 issue +0.0 it +0.0 iv +0.0 ix +0.0 jeopardy diff --git a/proj1.jar b/proj1.jar new file mode 100644 index 0000000..ee1f63a Binary files /dev/null and b/proj1.jar differ