diff --git a/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java b/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java
index 654f24b6..9f1a3fda 100644
--- a/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java
+++ b/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java
@@ -1,7 +1,7 @@
package edu.usc.cssl.tacit.common.ui.corpusmanagement.services;
public enum CMDataType {
- JSON,REDDIT_JSON, TWITTER_JSON, STACKEXCHANGE_JSON, FRONTIER_JSON, TYPEPAD_JSON, CONGRESS_JSON, PLAIN_TEXT, XML, MICROSOFT_WORD, PRESIDENCY_JSON, HANSARD_JSON, IMPORTED_CSV, PLOSONE_JSON, GOVTRACK_JSON, LATIN_JSON;
+ JSON,REDDIT_JSON, TWITTER_JSON, STACKEXCHANGE_JSON, FRONTIER_JSON, TYPEPAD_JSON, CONGRESS_JSON, PLAIN_TEXT, XML, MICROSOFT_WORD, PRESIDENCY_JSON, HANSARD_JSON, IMPORTED_CSV, PLOSONE_JSON, GOVTRACK_JSON, LATIN_JSON, GUTENBERG_JSON;
public static CMDataType get(String dataType) {
if(dataType.equals("PLAIN_TEXT")) return CMDataType.PLAIN_TEXT;
@@ -20,6 +20,7 @@ public static CMDataType get(String dataType) {
else if(dataType.equals("PLOSONE_JSON")) return CMDataType.PLOSONE_JSON;
else if(dataType.equals("GOVTRACK_JSON")) return CMDataType.GOVTRACK_JSON;
else if(dataType.equals("LATIN_JSON")) return CMDataType.LATIN_JSON;
+ else if(dataType.equals("GUTENBERG_JSON")) return CMDataType.GUTENBERG_JSON;
return null;
}
}
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/META-INF/MANIFEST.MF b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/META-INF/MANIFEST.MF
new file mode 100644
index 00000000..7504df20
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/META-INF/MANIFEST.MF
@@ -0,0 +1,16 @@
+Manifest-Version: 1.0
+Bundle-ManifestVersion: 2
+Bundle-Name: Ui
+Bundle-SymbolicName: edu.usc.cssl.tacit.crawlers.gutenberg.ui;singleton:=true
+Bundle-Version: 1.0.0.qualifier
+Bundle-Activator: edu.usc.cssl.tacit.crawlers.gutenberg.ui.Activator
+Require-Bundle: org.eclipse.ui,
+ org.eclipse.core.runtime,
+ org.eclipse.ui.forms;bundle-version="3.6.200",
+ edu.usc.cssl.tacit.common.ui;bundle-version="1.0.0",
+ edu.usc.cssl.tacit.help;bundle-version="1.0.0",
+ org.eclipse.help;bundle-version="3.6.0",
+ edu.usc.cssl.tacit.common;bundle-version="1.0.0",
+ edu.usc.cssl.tacit.crawlers.gutenberg;bundle-version="1.0.0"
+Bundle-RequiredExecutionEnvironment: JavaSE-1.7
+Bundle-ActivationPolicy: lazy
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/build.properties b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/build.properties
new file mode 100644
index 00000000..e9863e28
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/build.properties
@@ -0,0 +1,5 @@
+source.. = src/
+output.. = bin/
+# icons/ holds the view's image files (GutenbergCrawlerIcon.png, lrun_obj.gif, ...);
+# list it here so the images are packaged with the built bundle.
+bin.includes = META-INF/,\
+               .,\
+               plugin.xml,\
+               icons/
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/GutenbergCrawlerIcon.png b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/GutenbergCrawlerIcon.png
new file mode 100644
index 00000000..23163601
Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/GutenbergCrawlerIcon.png differ
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/file_obj.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/file_obj.gif
new file mode 100644
index 00000000..7ccc6a70
Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/file_obj.gif differ
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/fldr_obj.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/fldr_obj.gif
new file mode 100644
index 00000000..51e703b1
Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/fldr_obj.gif differ
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/help_contents.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/help_contents.gif
new file mode 100644
index 00000000..9d70301d
Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/help_contents.gif differ
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/lrun_obj.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/lrun_obj.gif
new file mode 100644
index 00000000..57f41022
Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/lrun_obj.gif differ
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/plugin.xml b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/plugin.xml
new file mode 100644
index 00000000..1e289eae
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/plugin.xml
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/Activator.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/Activator.java
new file mode 100644
index 00000000..fb8b0f05
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/Activator.java
@@ -0,0 +1,50 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.ui;
+
+import org.eclipse.ui.plugin.AbstractUIPlugin;
+import org.osgi.framework.BundleContext;
+
+/**
+ * Activator for the Gutenberg crawler UI plug-in. It keeps a reference to the
+ * single live instance between {@link #start} and {@link #stop}.
+ */
+public class Activator extends AbstractUIPlugin {
+
+    /** The plug-in ID. */
+    public static final String PLUGIN_ID = "edu.usc.cssl.tacit.crawlers.gutenberg.ui"; //$NON-NLS-1$
+
+    /** The shared instance; assigned in start() and cleared in stop(). */
+    private static Activator plugin;
+
+    /**
+     * The constructor
+     */
+    public Activator() {
+    }
+
+    /**
+     * Delegates to the superclass and then publishes this object as the shared instance.
+     *
+     * @param context the bundle context handed over by the framework
+     * @throws Exception propagated from the superclass start
+     */
+    @Override
+    public void start(BundleContext context) throws Exception {
+        super.start(context);
+        plugin = this;
+    }
+
+    /**
+     * Clears the shared instance and then delegates to the superclass.
+     *
+     * @param context the bundle context handed over by the framework
+     * @throws Exception propagated from the superclass stop
+     */
+    @Override
+    public void stop(BundleContext context) throws Exception {
+        plugin = null;
+        super.stop(context);
+    }
+
+    /**
+     * Returns the shared instance
+     *
+     * @return the shared instance, or null when the plug-in is not started
+     */
+    public static Activator getDefault() {
+        return plugin;
+    }
+
+}
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/GutenbergCrawlerView.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/GutenbergCrawlerView.java
new file mode 100644
index 00000000..e8aee9ab
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/GutenbergCrawlerView.java
@@ -0,0 +1,1162 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.ui;
+
+import java.awt.Image;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.eclipse.core.runtime.IProgressMonitor;
+import org.eclipse.core.runtime.IStatus;
+import org.eclipse.core.runtime.Status;
+import org.eclipse.core.runtime.jobs.IJobChangeEvent;
+import org.eclipse.core.runtime.jobs.Job;
+import org.eclipse.core.runtime.jobs.JobChangeAdapter;
+import org.eclipse.jface.action.Action;
+import org.eclipse.jface.action.IToolBarManager;
+import org.eclipse.jface.dialogs.IMessageProvider;
+import org.eclipse.jface.layout.GridDataFactory;
+import org.eclipse.jface.layout.GridLayoutFactory;
+import org.eclipse.jface.resource.ImageDescriptor;
+import org.eclipse.jface.viewers.ILabelProvider;
+import org.eclipse.jface.viewers.LabelProvider;
+import org.eclipse.jface.window.Window;
+import org.eclipse.swt.SWT;
+import org.eclipse.swt.custom.ScrolledComposite;
+import org.eclipse.swt.events.KeyEvent;
+import org.eclipse.swt.events.KeyListener;
+import org.eclipse.swt.events.SelectionAdapter;
+import org.eclipse.swt.events.SelectionEvent;
+import org.eclipse.swt.events.SelectionListener;
+import org.eclipse.swt.layout.GridData;
+import org.eclipse.swt.layout.GridLayout;
+import org.eclipse.swt.widgets.Button;
+import org.eclipse.swt.widgets.Combo;
+import org.eclipse.swt.widgets.Composite;
+import org.eclipse.swt.widgets.DirectoryDialog;
+import org.eclipse.swt.widgets.Display;
+import org.eclipse.swt.widgets.Group;
+import org.eclipse.swt.widgets.Label;
+import org.eclipse.swt.widgets.Table;
+import org.eclipse.swt.widgets.TableItem;
+import org.eclipse.swt.widgets.Text;
+import org.eclipse.swt.widgets.Tree;
+import org.eclipse.swt.widgets.TreeItem;
+import org.eclipse.ui.PlatformUI;
+import org.eclipse.ui.dialogs.ElementListSelectionDialog;
+import org.eclipse.ui.dialogs.PreferencesUtil;
+import org.eclipse.ui.forms.IFormColors;
+import org.eclipse.ui.forms.events.HyperlinkEvent;
+import org.eclipse.ui.forms.events.IHyperlinkListener;
+import org.eclipse.ui.forms.widgets.FormToolkit;
+import org.eclipse.ui.forms.widgets.Hyperlink;
+import org.eclipse.ui.forms.widgets.ScrolledForm;
+import org.eclipse.ui.forms.widgets.Section;
+import org.eclipse.ui.part.ViewPart;
+
+import edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal.IGutenbergCrawlerViewConstants;
+import edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal.GutenbergCrawlerViewImageRegistry;
+import edu.usc.cssl.tacit.common.Preprocessor;
+import edu.usc.cssl.tacit.common.ui.composite.from.TacitFormComposite;
+import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.CorpusClass;
+import edu.usc.cssl.tacit.common.ui.internal.TargetLocationsGroup;
+import edu.usc.cssl.tacit.common.ui.outputdata.TableLayoutData;
+import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
+import edu.usc.cssl.tacit.crawlers.gutenberg.services.GutenbergConstants;
+import edu.usc.cssl.tacit.crawlers.gutenberg.services.*;
+import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.Corpus;
+import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.CMDataType;
+import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.CorpusClass;
+import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.ManageCorpora;
+
+
+
+
+/**
+ * Gutenberg Crawler View: collects the crawl parameters and runs the Project Gutenberg crawler
+ */
+public class GutenbergCrawlerView extends ViewPart {
+ /** Identifier of this view (presumably the id it is registered under in plugin.xml). */
+ public static final String ID = "edu.usc.cssl.tacit.crawlers.gutenberg.ui.view1";
+
+ private ScrolledForm form;
+ private FormToolkit toolkit;
+ private TableLayoutData classLayoutData;
+
+ // Search-type radio buttons and the keyword box
+ private Composite searchComposite;
+ private Button searchButton; // "Popular Search"
+ private Button MPButton; // "Latest Search"
+ private Text keywordSearchText;
+
+ // NOTE(review): bothButton, commonsButton and lordsButton are not referenced in the
+ // visible part of this class - confirm whether they are still needed.
+ private Button bothButton;
+ private Button commonsButton;
+ private Button lordsButton;
+ private Button domainButton; // "Domain and Sub Domain Search"
+
+ // Domain / sub-domain selection widgets
+ Combo domainList;
+ private Table subdomainTable;
+ private Button addSubdomainBtn;
+ private Button removeSubdomainButton;
+
+ // Result limit and corpus name
+ private Button checkPages;
+ private Text pageText;
+ private Text corpusNameTxt;
+ private ElementListSelectionDialog listDialog;
+
+ /** Sub-domains picked by the user; stays null until the first pick is added. */
+ private List<String> selectedRepresentatives;
+
+ // Classification parameters
+
+ private Text outputPath;
+
+ private boolean canProceed = false;
+
+ protected Job job;
+
+ private boolean checkType = true;
+ boolean breakFlag = false;
+
+ // Snapshot of the selected search type, refreshed from the radio buttons when a crawl starts
+ private boolean isDomain = false;
+ private boolean isSearch = false;
+ private boolean isLatest = false;
+
+ /**
+  * Domain names shown in the combo. The combo's selection index is also used to look up
+  * the sub-domains in GutenbergConstants.sites, so the order here must match that table.
+  */
+ final String[] domains = new String[]{"Animals","Children","Classics","Countries","Crime","Knowledge","Fiction","Fine Arts","General Works","Geography","History","Language and Literature","Law","Music","Periodicals","Psychology and Philosophy","Religion","Science","Social Sciences","Technology","Wars"};
+
+
+ /**
+  * Supplies the Gutenberg crawler icon as this view's title image.
+  * The return type is fully qualified because this file also imports java.awt.Image.
+  *
+  * @return the image registered under IMAGE_GUTENBERG_OBJ
+  */
+ @Override
+ public org.eclipse.swt.graphics.Image getTitleImage() {
+     return GutenbergCrawlerViewImageRegistry.getImageIconFactory()
+             .getImage(IGutenbergCrawlerViewConstants.IMAGE_GUTENBERG_OBJ);
+ }
+ /**
+  * Builds the view contents: the form body, the crawl input widgets, the toolbar
+  * buttons and the form icon.
+  *
+  * @param parent composite supplied by the workbench to host this view
+  */
+ @Override
+ public void createPartControl(Composite parent) {
+     // createFormBodySection (defined elsewhere in this class) presumably also assigns
+     // the form field, which is dereferenced immediately below.
+     toolkit = createFormBodySection(parent, "GUTENBERG CRAWLER");
+     Section section = toolkit.createSection(form.getBody(), Section.TITLE_BAR | Section.EXPANDED);
+     GridDataFactory.fillDefaults().grab(true, false).span(3, 1).applyTo(section);
+     section.setExpanded(true);
+
+     // Scrollable holder inside the section; it only receives the spacer row below
+     ScrolledComposite sc = new ScrolledComposite(section, SWT.H_SCROLL | SWT.V_SCROLL);
+     sc.setExpandHorizontal(true);
+     sc.setExpandVertical(true);
+     GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(false).applyTo(sc);
+
+     // Spacer row
+     TacitFormComposite.createEmptyRow(toolkit, sc);
+
+     // Single-column composite that hosts all crawl input widgets
+     Composite client = toolkit.createComposite(form.getBody());
+     GridLayoutFactory.fillDefaults().equalWidth(true).numColumns(1).applyTo(client);
+     GridDataFactory.fillDefaults().grab(true, false).span(1, 1).applyTo(client);
+
+     createCrawlInputParameters(toolkit, client);
+
+     // Add run and help buttons on the toolbar
+     addButtonsToToolBar();
+     form.setImage(GutenbergCrawlerViewImageRegistry.getImageIconFactory().getImage(IGutenbergCrawlerViewConstants.IMAGE_GUTENBERG_OBJ));
+
+ }
+
+ /**
+  * Shows a directory chooser titled "Open" on the shell of the given button.
+  *
+  * @param browseBtn
+  *            button whose shell becomes the dialog's parent
+  * @return whatever DirectoryDialog.open() returns for the user's choice
+  */
+ protected String openBrowseDialog(Button browseBtn) {
+     DirectoryDialog chooser = new DirectoryDialog(browseBtn.getShell(), SWT.OPEN);
+     chooser.setText("Open");
+     return chooser.open();
+ }
+
+
+ /**
+ * Checks to ensure read permission of the given location
+ *
+ * @param location
+ * - Directory path
+ * @return
+ */
+ public String validateInputDirectory(String location) {
+ File locationFile = new File(location);
+ if (locationFile.canRead()) {
+ return null;
+ } else {
+ return "Classification Input Path : Permission Denied";
+ }
+ }
+
+ /**
+ * Checks to ensure read permission of the given location
+ *
+ * @param location
+ * - path
+ * @return
+ */
+ public String validateOutputDirectory(String location) {
+ File locationFile = new File(location);
+ if (locationFile.canWrite()) {
+ return null;
+ } else {
+ return "Output Path : Permission Denied";
+ }
+ }
+
+ /**
+ * Validation for "Output path"
+ *
+ * @param outputText
+ * @param errorMessage
+ * - error message to be displayed if required
+ * @return
+ */
+ protected boolean outputPathListener(Text outputText, String errorMessage) {
+ if (outputText.getText().isEmpty()) {
+ form.getMessageManager().addMessage("outputPath", errorMessage, null, IMessageProvider.ERROR);
+ return false;
+ }
+ File tempFile = new File(outputText.getText());
+ if (!tempFile.exists() || !tempFile.isDirectory()) {
+ form.getMessageManager().addMessage("outputPath", errorMessage, null, IMessageProvider.ERROR);
+ return false;
+ } else {
+ form.getMessageManager().removeMessage("outputPath");
+ String message = validateOutputDirectory(outputText.getText().toString());
+ if (null != message) {
+ form.getMessageManager().addMessage("outputPath", message, null, IMessageProvider.ERROR);
+ return false;
+ }
+ }
+ return true;
+ }
+
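+ /*
+  * Builds the "Input Details" section (search type, keyword, domain and sub-domain
+  * selection, result limit) together with the corpus-name field and the Run button,
+  * and wires up their listeners.
+  */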
+
+ private void createCrawlInputParameters(final FormToolkit toolkit, final Composite parent) {
+
+ Section inputParamsSection = toolkit.createSection(parent, Section.TITLE_BAR | Section.EXPANDED | Section.DESCRIPTION);
+ GridDataFactory.fillDefaults().grab(true, false).span(1, 1).applyTo(inputParamsSection);
+ GridLayoutFactory.fillDefaults().numColumns(4).applyTo(inputParamsSection);
+ inputParamsSection.setText("Input Details");
+
+ ScrolledComposite sc = new ScrolledComposite(inputParamsSection, SWT.H_SCROLL | SWT.V_SCROLL | SWT.BORDER);
+ sc.setExpandHorizontal(true);
+ sc.setExpandVertical(true);
+ GridLayoutFactory.fillDefaults().numColumns(2).equalWidth(false).applyTo(sc);
+
+ Composite mainComposite = toolkit.createComposite(inputParamsSection);
+ sc.setContent(mainComposite);
+ GridDataFactory.fillDefaults().grab(true, true).applyTo(sc);
+ GridLayoutFactory.fillDefaults().numColumns(2).equalWidth(false).applyTo(mainComposite);
+ inputParamsSection.setClient(mainComposite);
+
+ searchComposite = toolkit.createComposite(mainComposite);
+ GridDataFactory.fillDefaults().grab(true, false).span(4, 0).applyTo(searchComposite);
+ GridLayoutFactory.fillDefaults().numColumns(1).equalWidth(false).applyTo(searchComposite);
+
+ Group searchGroup = new Group(searchComposite, SWT.NONE);
+ GridDataFactory.fillDefaults().grab(true, false).span(1, 0).applyTo(searchGroup);
+ GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(false).applyTo(searchGroup);
+ searchGroup.setText("Search type:");
+
+ searchButton = new Button(searchGroup, SWT.RADIO);
+ searchButton.setText("Popular Search");
+ GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(searchButton);
+ searchButton.setSelection(true);
+
+ MPButton = new Button(searchGroup, SWT.RADIO);
+ MPButton.setText("Latest Search");
+ GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(MPButton);
+ MPButton.setSelection(false);
+
+ domainButton = new Button(searchGroup, SWT.RADIO);
+ domainButton.setText("Domain and Sub Domain Search");
+ GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(domainButton);
+ domainButton.setSelection(false);
+
+ /*
+ //********************************
+ Group tryComposite = new Group(mainComposite, SWT.NONE);
+ GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(true).applyTo(tryComposite);
+ GridDataFactory.fillDefaults().grab(true, false).span(4, 0).indent(0,20).applyTo(tryComposite);
+ tryComposite.setText("Keyword search:");
+
+ searchButton = new Button(tryComposite, SWT.RADIO);
+ searchButton.setText("Popular Search");
+ GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(searchButton);
+ searchButton.setSelection(true);
+
+ MPButton = new Button(tryComposite, SWT.RADIO);
+ MPButton.setText("Latest Search");
+ GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(MPButton);
+ MPButton.setSelection(false);
+
+ domainButton = new Button(tryComposite, SWT.RADIO);
+ domainButton.setText("Domain and Sub Domain Search");
+ GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(domainButton);
+ domainButton.setSelection(false);
+ //********************************
+ *
+ */
+
+ Group searchFilterComposite = new Group(mainComposite, SWT.NONE);
+ GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(true).applyTo(searchFilterComposite);
+ GridDataFactory.fillDefaults().grab(true, false).span(4, 0).indent(0,20).applyTo(searchFilterComposite);
+ searchFilterComposite.setText("Keyword search:");
+
+ final Label searchLabel = new Label(searchFilterComposite, SWT.NONE);
+ searchLabel.setText("Keyword:");
+ GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(searchLabel);
+
+ keywordSearchText = new Text(searchFilterComposite, SWT.BORDER);
+ GridDataFactory.fillDefaults().grab(true, false).indent(0,10).span(2, 0).applyTo(keywordSearchText);
+ keywordSearchText.setMessage("Enter a search term");
+
+ final Group domainFilterComposite = new Group(mainComposite, SWT.SHADOW_IN);
+ GridDataFactory.fillDefaults().grab(true, false).span(4, 0).applyTo(domainFilterComposite);
+ domainFilterComposite.setText("Select Domain and Sub Domain:");
+ GridLayoutFactory.fillDefaults().numColumns(3).applyTo(domainFilterComposite);
+
+ Label domain = new Label(domainFilterComposite, SWT.NONE);
+ domain.setText("Select Domain:");
+ GridDataFactory.fillDefaults().grab(false, false).span(1, 0).applyTo(domain);
+
+ domainList = new Combo(domainFilterComposite, SWT.FLAT | SWT.READ_ONLY);
+ GridDataFactory.fillDefaults().grab(true, false).span(2, 0).applyTo(domainList);
+ toolkit.adapt(domainList);
+ domainList.setItems(domains);
+ domainList.select(0);
+ domainList.setEnabled(false);
+
+ Label sortType = new Label(domainFilterComposite, SWT.NONE);
+ sortType.setText("Select sub-domains:");
+ subdomainTable = new Table(domainFilterComposite, SWT.BORDER | SWT.MULTI);
+ GridDataFactory.fillDefaults().grab(true, true).span(1, 3).hint(90, 50).applyTo(subdomainTable);
+ subdomainTable.setEnabled(false);
+
+ Composite buttonComp = new Composite(domainFilterComposite, SWT.NONE);
+ GridLayout btnLayout = new GridLayout();
+ btnLayout.marginWidth = btnLayout.marginHeight = 0;
+ btnLayout.makeColumnsEqualWidth = false;
+ buttonComp.setLayout(btnLayout);
+ buttonComp.setLayoutData(new GridData(GridData.FILL_VERTICAL));
+
+ addSubdomainBtn = new Button(buttonComp, SWT.PUSH); // $NON-NLS-1$
+ addSubdomainBtn.setText("Add...");
+ GridDataFactory.fillDefaults().grab(false, false).span(1, 1).applyTo(addSubdomainBtn);
+ addSubdomainBtn.setEnabled(false);
+
+ addSubdomainBtn.addSelectionListener(new SelectionAdapter() {
+ @Override
+ public void widgetSelected(SelectionEvent e) {
+
+ ILabelProvider lp = new ArrayLabelProvider();
+ listDialog = new ElementListSelectionDialog(addSubdomainBtn.getShell(), lp);
+ listDialog.setTitle("Select domain");
+ listDialog.setMessage("Type the name of the domain");
+ listDialog.setMultipleSelection(true);
+ listDialog.setElements(GutenbergConstants.sites.get(domainList.getSelectionIndex()));
+ if (listDialog.open() == Window.OK) {
+ updateTable(listDialog.getResult());
+ }
+ }
+
+ });
+
+ removeSubdomainButton = new Button(buttonComp, SWT.PUSH);
+ removeSubdomainButton.setText("Remove...");
+ GridDataFactory.fillDefaults().grab(false, false).span(1, 1).applyTo(removeSubdomainButton);
+ removeSubdomainButton.setEnabled(false);
+
+ removeSubdomainButton.addSelectionListener(new SelectionAdapter() {
+ /**
+  * Removes the highlighted rows from the sub-domain table and from the selection list,
+  * then disables the Remove button once nothing is left.
+  */
+ @Override
+ public void widgetSelected(SelectionEvent e) {
+     // The list stays null until updateTable() has run once; treat that as "empty"
+     // instead of dereferencing it.
+     if (selectedRepresentatives != null) {
+         for (TableItem item : subdomainTable.getSelection()) {
+             selectedRepresentatives.remove(item.getText());
+             item.dispose();
+         }
+     }
+     if (selectedRepresentatives == null || selectedRepresentatives.isEmpty()) {
+         removeSubdomainButton.setEnabled(false);
+     }
+ }
+ });
+
+ Group limitGroup = new Group(mainComposite, SWT.SHADOW_IN);
+ GridDataFactory.fillDefaults().grab(true, false).span(4, 0).applyTo(limitGroup);
+ limitGroup.setText("Filter Results");
+ GridLayoutFactory.fillDefaults().numColumns(3).applyTo(limitGroup);
+
+ final Composite limitClient = new Composite(limitGroup, SWT.None);
+ GridDataFactory.fillDefaults().grab(true, false).span(1, 1).indent(10, 10).applyTo(limitClient);
+ GridLayoutFactory.fillDefaults().numColumns(2).equalWidth(false).applyTo(limitClient);
+
+ checkPages = new Button(limitClient, SWT.CHECK);
+ GridDataFactory.fillDefaults().grab(true, false).span(2, 0).applyTo(checkPages);
+ checkPages.setText("Limit Pages");
+
+ Label limitPages = new Label(limitClient, SWT.NONE);
+ limitPages.setText("Limit records per sub-domains:");
+ GridDataFactory.fillDefaults().grab(false, false).span(1, 0).applyTo(limitPages);
+ pageText = new Text(limitClient, SWT.BORDER);
+ GridDataFactory.fillDefaults().grab(true, false).span(1, 0).applyTo(pageText);
+ pageText.setEnabled(false);
+
+ checkPages.addSelectionListener(new SelectionListener() {
+ @Override
+ public void widgetSelected(SelectionEvent e) {
+ if(checkPages.getSelection())
+ pageText.setEnabled(true);
+ else
+ pageText.setEnabled(false);
+ }
+
+ @Override
+ public void widgetDefaultSelected(SelectionEvent e) {
+ // TODO Auto-generated method stub
+
+ }
+ });
+
+ searchButton.addSelectionListener(new SelectionListener() {
+ @Override
+ public void widgetSelected(SelectionEvent e) {
+ if(searchButton.getSelection())
+ {
+ keywordSearchText.setEnabled(true);
+ domainList.setEnabled(false);
+ subdomainTable.setEnabled(false);
+
+ }
+ else
+ {
+ keywordSearchText.setEnabled(false);
+ }
+ }
+
+ @Override
+ public void widgetDefaultSelected(SelectionEvent e) {
+ // TODO Auto-generated method stub
+
+ }
+ });
+
+ MPButton.addSelectionListener(new SelectionListener() {
+ /**
+  * Reacts to the "Latest Search" radio being selected: neither a keyword nor a domain
+  * applies, so the keyword box and every domain widget are disabled.
+  */
+ @Override
+ public void widgetSelected(SelectionEvent e) {
+     if (MPButton.getSelection()) {
+         keywordSearchText.setEnabled(false);
+         domainList.setEnabled(false);
+         subdomainTable.setEnabled(false);
+         // Add/Remove belong to domain mode and start out disabled; switch them
+         // back off when the user leaves that mode.
+         addSubdomainBtn.setEnabled(false);
+         removeSubdomainButton.setEnabled(false);
+     }
+ }
+
+ @Override
+ public void widgetDefaultSelected(SelectionEvent e) {
+ // TODO Auto-generated method stub
+
+ }
+ });
+
+ domainButton.addSelectionListener(new SelectionListener() {
+ @Override
+ public void widgetSelected(SelectionEvent e) {
+ if(domainButton.getSelection())
+ {
+ domainList.setEnabled(true);
+ subdomainTable.setEnabled(true);
+ addSubdomainBtn.setEnabled(true);
+ removeSubdomainButton.setEnabled(true);
+ }
+ }
+
+ @Override
+ public void widgetDefaultSelected(SelectionEvent e) {
+ // TODO Auto-generated method stub
+
+ }
+ });
+
+
+ TacitFormComposite.createEmptyRow(toolkit, limitGroup);
+
+ Composite client = toolkit.createComposite(form.getBody());
+ GridLayoutFactory.fillDefaults().equalWidth(true).numColumns(1).applyTo(client); // Align
+ // the
+ // composite
+ // section
+ // to
+ // one
+ // column
+ GridDataFactory.fillDefaults().grab(true, false).span(1, 1).applyTo(client);
+
+ TacitFormComposite.createEmptyRow(toolkit, client);
+ corpusNameTxt = TacitFormComposite.createCorpusSection(toolkit, client, form.getMessageManager());
+ TacitFormComposite.createEmptyRow(toolkit, client);
+ Button btnRun = TacitFormComposite.createRunButton(client, toolkit);
+
+
+ btnRun.addSelectionListener(new SelectionListener() {
+
+ /**
+  * Handles the Run button: builds the background crawl job and, if canItProceed()
+  * allows it, attaches the completion listener and schedules the job.
+  */
+ @Override
+ public void widgetSelected(SelectionEvent e) {
+     final Job crawlJob = new Job("Gutenberg Crawler") {
+         String outputDir;
+         String corpusName;
+         Corpus corpus;
+         int pages;
+         String query;
+
+         /**
+          * Snapshots the form state, runs the crawler that matches the selected
+          * search type and saves the resulting corpus.
+          */
+         @Override
+         protected IStatus run(IProgressMonitor monitor) {
+
+             TacitFormComposite.setConsoleViewInFocus();
+             TacitFormComposite.updateStatusMessage(getViewSite(), null, null, form);
+             // The widgets are read inside syncExec so the access happens on the display thread
+             Display.getDefault().syncExec(new Runnable() {
+                 @Override
+                 public void run() {
+                     // NOTE(review): parseInt throws on non-numeric text; presumably
+                     // canItProceed() has validated the field - confirm.
+                     if (checkPages.getSelection())
+                         pages = Integer.parseInt(pageText.getText());
+                     else
+                         pages = -1;
+                     corpusName = corpusNameTxt.getText();
+                     isDomain = domainButton.getSelection();
+                     isSearch = searchButton.getSelection();
+                     isLatest = MPButton.getSelection();
+                     query = keywordSearchText.getText();
+                     outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName.trim();
+                     if (!new File(outputDir).exists()) {
+                         new File(outputDir).mkdirs();
+                     }
+                 }
+             });
+
+             int progressSize = 30;
+             monitor.beginTask("Running Gutenberg Crawler...", progressSize);
+             TacitFormComposite.writeConsoleHeaderBegining("Gutenberg Crawler started");
+             GutenbergMain objmain = new GutenbergMain();
+             SearchLatest objlatest = new SearchLatest();
+             SearchPopular objpopular = new SearchPopular();
+             monitor.subTask("Initializing...");
+             monitor.worked(10);
+             // Return the cancel status; without the return the request was ignored here
+             if (monitor.isCanceled())
+                 return handledCancelRequest("Crawling is Stopped");
+             corpus = new Corpus(corpusName, CMDataType.GUTENBERG_JSON);
+
+             if (isDomain) {
+                 // One corpus class (and output folder) per selected sub-domain
+                 for (final String domain : selectedRepresentatives) {
+                     outputDir = prepareClassDir(domain);
+                     try {
+                         monitor.subTask("Crawling...");
+                         if (monitor.isCanceled())
+                             return handledCancelRequest("Crawling is Stopped");
+                         objmain.crawl(outputDir, domain, pages, monitor);
+                         if (monitor.isCanceled())
+                             return handledCancelRequest("Crawling is Stopped");
+                     } catch (Exception ex) {
+                         return handleException(monitor, ex, "Crawling failed. Provide valid data");
+                     }
+                     if (!registerCorpusClass(domain))
+                         return Status.CANCEL_STATUS;
+                 }
+             }
+
+             if (isSearch) {
+                 // Popular search: the keyword doubles as the class name
+                 outputDir = prepareClassDir(query);
+                 try {
+                     monitor.subTask("Crawling...");
+                     if (monitor.isCanceled())
+                         return handledCancelRequest("Crawling is Stopped");
+                     objpopular.popular(outputDir, pages, query, monitor);
+                     if (monitor.isCanceled())
+                         return handledCancelRequest("Crawling is Stopped");
+                 } catch (Exception ex) {
+                     return handleException(monitor, ex, "Crawling failed. Provide valid data");
+                 }
+                 if (!registerCorpusClass(query))
+                     return Status.CANCEL_STATUS;
+             }
+
+             if (isLatest) {
+                 outputDir = prepareClassDir("latest");
+                 try {
+                     monitor.subTask("Crawling...");
+                     if (monitor.isCanceled())
+                         return handledCancelRequest("Crawling is Stopped");
+                     objlatest.latest(outputDir, pages, monitor);
+                     if (monitor.isCanceled())
+                         return handledCancelRequest("Crawling is Stopped");
+                 } catch (Exception ex) {
+                     return handleException(monitor, ex, "Crawling failed. Provide valid data");
+                 }
+                 if (!registerCorpusClass("latest"))
+                     return Status.CANCEL_STATUS;
+             }
+
+             ManageCorpora.saveCorpus(corpus);
+             if (monitor.isCanceled())
+                 return handledCancelRequest("Crawling is Stopped");
+             ConsoleView.printlInConsoleln("Created Corpus: "+corpusName);
+             monitor.worked(100);
+             monitor.done();
+             return Status.OK_STATUS;
+
+         }
+
+         /** Returns corpus-location/corpusName/className, creating the folder when missing. */
+         private String prepareClassDir(String className) {
+             String dir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName;
+             dir += File.separator + className;
+             if (!new File(dir).exists()) {
+                 new File(dir).mkdirs();
+             }
+             return dir;
+         }
+
+         /** Adds a corpus class for the current outputDir inside syncExec; false if that failed. */
+         private boolean registerCorpusClass(final String className) {
+             try {
+                 Display.getDefault().syncExec(new Runnable() {
+
+                     @Override
+                     public void run() {
+                         CorpusClass cc = new CorpusClass(className, outputDir);
+                         cc.setParent(corpus);
+                         corpus.addClass(cc);
+                     }
+                 });
+             } catch (Exception ex) {
+                 ex.printStackTrace();
+                 return false;
+             }
+             return true;
+         }
+     };
+     crawlJob.setUser(true);
+     boolean canProceed = canItProceed();
+     if (canProceed) {
+         // Attach the listener before scheduling so a job that finishes quickly
+         // cannot complete before anyone is listening.
+         crawlJob.addJobChangeListener(new JobChangeAdapter() {
+
+             @Override
+             public void done(IJobChangeEvent event) {
+                 if (!event.getResult().isOK()) {
+                     TacitFormComposite
+                             .writeConsoleHeaderBegining("Error: Gutenberg Crawler ");
+                     TacitFormComposite.updateStatusMessage(getViewSite(), "Crawling is stopped",
+                             IStatus.INFO, form);
+
+                 } else {
+                     TacitFormComposite.updateStatusMessage(getViewSite(),
+                             "Gutenberg Crawler completed", IStatus.OK, form);
+                     ConsoleView.printlInConsoleln("Gutenberg Crawler completed successfully.");
+                     TacitFormComposite
+                             .writeConsoleHeaderBegining("Success: Gutenberg Crawler ");
+
+                 }
+             }
+         });
+         crawlJob.schedule(); // schedule the job
+     }
+
+ }
+
+ @Override
+ public void widgetDefaultSelected(SelectionEvent e) {
+ // TODO Auto-generated method stub
+
+ }
+ });
+ }
+
+ static class ArrayLabelProvider extends LabelProvider {
+ @Override
+ public String getText(Object element) {
+ return (String) element;
+ }
+ }
+
+ /**
+  * Merges newly picked sub-domains into the current selection (skipping duplicates),
+  * rebuilds the sub-domain table from it and enables the Remove button when the
+  * selection is not empty.
+  *
+  * @param result entries picked in the selection dialog, each cast to String;
+  *               a null array is treated as "nothing picked"
+  */
+ public void updateTable(Object[] result) {
+     if (selectedRepresentatives == null) {
+         selectedRepresentatives = new ArrayList<String>();
+     }
+
+     if (result != null) {
+         for (Object picked : result) {
+             String name = (String) picked;
+             if (!selectedRepresentatives.contains(name)) {
+                 selectedRepresentatives.add(name);
+             }
+         }
+     }
+
+     // Redraw the table from scratch so it always mirrors the list
+     subdomainTable.removeAll();
+     for (String itemName : selectedRepresentatives) {
+         TableItem item = new TableItem(subdomainTable, 0);
+         item.setText(itemName);
+     }
+
+     if (!selectedRepresentatives.isEmpty()) {
+         removeSubdomainButton.setEnabled(true);
+     }
+
+ }
+
+
+ /**
+ * Adds "Crawl" and "Help" buttons to the toolbar of the Gutenberg Crawler form
+ */
+ private void addButtonsToToolBar() {
+ IToolBarManager mgr = form.getToolBarManager();
+ mgr.add(new Action() {
+ @Override
+ public ImageDescriptor getImageDescriptor() {
+ return (GutenbergCrawlerViewImageRegistry.getImageIconFactory().getImageDescriptor(IGutenbergCrawlerViewConstants.IMAGE_LRUN_OBJ));
+ }
+
+ @Override
+ public String getToolTipText() {
+ return "Crawl";
+ }
+
+ String outputDir;
+ String corpusName;
+ Corpus corpus;
+ int pages;
+ boolean canProceed;
+ String query;
+
+ @Override
+ public void run() {
+ TacitFormComposite.writeConsoleHeaderBegining("Gutenberg Crawler started");
+ TacitFormComposite.updateStatusMessage(getViewSite(), null, null, form);
+ job = new Job("Gutenberg Crawler") {
+ @Override
+ protected IStatus run(final IProgressMonitor monitor) {
+ TacitFormComposite.setConsoleViewInFocus();
+ TacitFormComposite.updateStatusMessage(getViewSite(), null, null, form);
+ monitor.beginTask("Running Gutenberg Crawler...", 100);
+ Date dateObj = new Date();
+ Display.getDefault().syncExec(new Runnable() {
+ @Override
+ public void run() {
+ if(checkPages.getSelection())
+ pages = Integer.parseInt(pageText.getText());
+ else
+ pages =-1;
+ corpusName = corpusNameTxt.getText();
+ isDomain = domainButton.getSelection();
+ isSearch = searchButton.getSelection();
+ isLatest = MPButton.getSelection();
+ query = keywordSearchText.getText();
+ outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator+ corpusName.trim();
+ if (!new File(outputDir).exists()) {
+ new File(outputDir).mkdirs();
+ }
+ }
+ });
+ //int progressSize = 0;
+ //if(domainButton.getSelection())
+ //{
+ //progressSize =selectedRepresentatives.size()*pages + 30;
+ //}
+ int progressSize = 30;
+ monitor.beginTask("Running Gutenberg Crawler...", progressSize);
+ TacitFormComposite.writeConsoleHeaderBegining("Gutenberg Crawler started");
+ GutenbergMain objmain = new GutenbergMain();
+ SearchLatest objlatest = new SearchLatest();
+ SearchPopular objpopular = new SearchPopular();
+ monitor.subTask("Initializing...");
+ monitor.worked(10);
+ if (monitor.isCanceled())
+ {
+ handledCancelRequest("Crawling is Stopped");
+ }
+ corpus = new Corpus(corpusName, CMDataType.GUTENBERG_JSON);
+ System.out.println("Name of corpus=============" + corpus);
+ if(isDomain)
+ {
+ System.out.println("I am inside domain button");
+ for (final String domain : selectedRepresentatives) {
+ System.out.println("Selected Representatives&&&&&&&&&&&&&&&&=" + selectedRepresentatives);
+ outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName;
+ outputDir += File.separator + domain;
+ if (!new File(outputDir).exists()) {
+ new File(outputDir).mkdirs();
+ }
+
+ try {
+ monitor.subTask("Crawling...");
+ if (monitor.isCanceled())
+ return handledCancelRequest("Crawling is Stopped");
+ objmain.crawl(outputDir, domain, pages, monitor);
+ if (monitor.isCanceled())
+ return handledCancelRequest("Crawling is Stopped");
+ } catch (Exception e) {
+ return handleException(monitor, e, "Crawling failed. Provide valid data");
+ }
+ try {
+ Display.getDefault().syncExec(new Runnable() {
+
+ @Override
+ public void run() {
+
+ CorpusClass cc = new CorpusClass(domain, outputDir);
+ cc.setParent(corpus);
+ corpus.addClass(cc);
+
+ }
+ });
+ } catch (Exception e) {
+ e.printStackTrace();
+ return Status.CANCEL_STATUS;
+ }
+ }
+ }
+ if(isSearch)
+ {
+ System.out.println("I am inside search button");
+ outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName;
+ outputDir += File.separator + query;
+ if (!new File(outputDir).exists()) {
+ new File(outputDir).mkdirs();
+ }
+
+ try {
+ monitor.subTask("Crawling...");
+ if (monitor.isCanceled())
+ return handledCancelRequest("Crawling is Stopped");
+ objpopular.popular(outputDir,pages,query, monitor);
+ if (monitor.isCanceled())
+ return handledCancelRequest("Crawling is Stopped");
+ } catch (Exception e) {
+ return handleException(monitor, e, "Crawling failed. Provide valid data");
+ }
+ try {
+ Display.getDefault().syncExec(new Runnable() {
+
+ @Override
+ public void run() {
+
+ CorpusClass cc = new CorpusClass(query, outputDir);
+ cc.setParent(corpus);
+ corpus.addClass(cc);
+
+ }
+ });
+ } catch (Exception e) {
+ e.printStackTrace();
+ return Status.CANCEL_STATUS;
+ }
+
+
+ }
+ if(isLatest)
+ {
+
+ System.out.println("I am inside latest button");
+ outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName;
+ outputDir += File.separator + "latest";
+ if (!new File(outputDir).exists()) {
+ new File(outputDir).mkdirs();
+ }
+
+ try {
+ monitor.subTask("Crawling...");
+ if (monitor.isCanceled())
+ return handledCancelRequest("Crawling is Stopped");
+ objlatest.latest(outputDir,pages, monitor);
+ if (monitor.isCanceled())
+ return handledCancelRequest("Crawling is Stopped");
+ } catch (Exception e) {
+ return handleException(monitor, e, "Crawling failed. Provide valid data");
+ }
+ try {
+ Display.getDefault().syncExec(new Runnable() {
+
+ @Override
+ public void run() {
+
+ CorpusClass cc = new CorpusClass("latest", outputDir);
+ cc.setParent(corpus);
+ corpus.addClass(cc);
+
+ }
+ });
+ } catch (Exception e) {
+ e.printStackTrace();
+ return Status.CANCEL_STATUS;
+ }
+
+
+
+ }
+ ManageCorpora.saveCorpus(corpus);
+ if (monitor.isCanceled())
+ return handledCancelRequest("Crawling is Stopped");
+ ConsoleView.printlInConsoleln("Created Corpus: "+corpusName);
+ monitor.worked(100);
+ monitor.done();
+ return Status.OK_STATUS;
+ }
+ };
+ job.setUser(true);
+ canProceed = canItProceed();
+ if (canProceed) {
+ job.schedule(); // schedule the job
+ job.addJobChangeListener(new JobChangeAdapter() {
+ public void done(IJobChangeEvent event) {
+ if (!event.getResult().isOK()) {
+ TacitFormComposite
+ .writeConsoleHeaderBegining("Error: Gutenberg Crawler");
+ } else {
+ TacitFormComposite.updateStatusMessage(getViewSite(),
+ "Gutenberg Crawler completed", IStatus.OK, form);
+ ConsoleView.printlInConsoleln("Gutenberg Crawler completed successfully.");
+ TacitFormComposite
+ .writeConsoleHeaderBegining("Success: Gutenberg Crawler ");
+
+ }
+ }
+ });
+ }
+ };
+
+
+
+
+ });
+
+ Action helpAction = new Action() {
+ @Override
+ public ImageDescriptor getImageDescriptor() {
+ return (GutenbergCrawlerViewImageRegistry.getImageIconFactory().getImageDescriptor(IGutenbergCrawlerViewConstants.IMAGE_HELP_CO));
+ }
+
+ @Override
+ public String getToolTipText() {
+ return "Help";
+ }
+
+ @Override
+ public void run() {
+ PlatformUI.getWorkbench().getHelpSystem()
+ .displayHelp("edu.usc.cssl.tacit.classify.naivebayes.ui.naivebayes");
+ };
+ };
+ mgr.add(helpAction);
+ PlatformUI.getWorkbench().getHelpSystem().setHelp(helpAction,
+ "edu.usc.cssl.tacit.classify.naivebayes.ui.naivebayes");
+ PlatformUI.getWorkbench().getHelpSystem().setHelp(form, "edu.usc.cssl.tacit.classify.naivebayes.ui.naivebayes");
+ form.getToolBarManager().update(true);
+ }
+
+ /**
+ * Reports a cancelled crawl: forwards the message to TacitFormComposite.updateStatusMessage and prints a cancellation line to the console view.
+ *
+ * @param message text handed to the status message update, with severity IStatus.ERROR
+ * @return always Status.CANCEL_STATUS
+ */
+ private IStatus handledCancelRequest(String message) {
+ TacitFormComposite.updateStatusMessage(getViewSite(), message, IStatus.ERROR, form);
+ ConsoleView.printlInConsoleln("Gutenberg Crawler cancelled.");
+ return Status.CANCEL_STATUS;
+
+ }
+
+	/**
+	 * Validates the form before a crawl is scheduled. Messages already shown
+	 * on the form are cleared first; the checks then run in this order and the
+	 * first failure is reported through the form's message manager:
+	 * <ul>
+	 * <li>domain mode: at least one sub domain has been selected</li>
+	 * <li>search mode: a keyword has been entered</li>
+	 * <li>page limit, when the limit box is ticked: a whole number of at least 1</li>
+	 * <li>corpus name: not empty, and no directory of that name exists under
+	 * the default Gutenberg corpus location</li>
+	 * </ul>
+	 *
+	 * The method reads widgets, so it has to be called on the UI thread.
+	 *
+	 * @return true when every check passes, false otherwise
+	 */
+	private boolean canItProceed() {
+		// Drop whatever an earlier validation left on the form.
+		form.getMessageManager().removeAllMessages();
+
+		// Domain mode: at least one sub domain must be selected.
+		if (domainButton.getSelection()) {
+			// The list is only created by updateTable, so it may still be null here;
+			// testing for that replaces the catch-all that used to absorb the NullPointerException.
+			if (selectedRepresentatives == null || selectedRepresentatives.isEmpty()) {
+				form.getMessageManager().addMessage("DomainError", "Enter atleast one sub domain name", null,
+						IMessageProvider.ERROR);
+				return false;
+			}
+			form.getMessageManager().removeMessage("DomainError");
+		}
+
+		// Search mode: a keyword is required.
+		if (searchButton.getSelection()) {
+			String query = keywordSearchText.getText();
+			if (query == null || query.isEmpty()) {
+				form.getMessageManager().addMessage("keyword", "Enter the keyword to be crawled", null,
+						IMessageProvider.ERROR);
+				return false;
+			}
+			// Clear this check's own key; "pageLimit" was named here before,
+			// which belongs to the page check below.
+			form.getMessageManager().removeMessage("keyword");
+		}
+
+		// Page limit: the crawl action only reads the page count when the limit
+		// box is ticked, so an unused field must not block the crawl.
+		if (checkPages.getSelection()) {
+			int pages;
+			try {
+				pages = Integer.parseInt(pageText.getText());
+			} catch (NumberFormatException e) {
+				// Not a number: report it the same way as a value below 1.
+				pages = 0;
+			}
+			if (pages < 1) {
+				form.getMessageManager().addMessage("pageLimit", "Enter the number of pages to be crawled", null,
+						IMessageProvider.ERROR);
+				return false;
+			}
+			form.getMessageManager().removeMessage("pageLimit");
+		}
+
+		// Validate corpus name
+		String corpusName = corpusNameTxt.getText();
+		if (null == corpusName || corpusName.isEmpty()) {
+			form.getMessageManager().addMessage("corpusName", "Provide corpus name", null,
+					IMessageProvider.ERROR);
+			return false;
+		}
+
+		// An existing directory of that name means the corpus was already created.
+		String outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName;
+		if (new File(outputDir).exists()) {
+			form.getMessageManager().addMessage("corpusName", "Corpus already exists", null,
+					IMessageProvider.ERROR);
+			return false;
+		}
+
+		form.getMessageManager().removeMessage("corpusName");
+		return true;
+	}
+
+ /**
+ * For every checked item returned by the layout's tree, stores the layout's selected items for that item in classPaths, keyed by the item's data string.
+ *
+ * @param classLayoutData supplies the tree and, per item, the selected items
+ * @param classPaths map that receives one entry per checked item. NOTE(review): "Map>" in the signature looks like generic parameters were stripped from this copy; confirm against the real source
+ */
+ protected void consolidateSelectedFiles(TableLayoutData classLayoutData, Map> classPaths) {
+ Tree tree = classLayoutData.getTree();
+ for (int i = 0; i < tree.getItemCount(); i++) {
+ TreeItem temp = tree.getItem(i);
+ if (temp.getChecked()) {
+ classPaths.put(temp.getData().toString(), classLayoutData.getSelectedItems(temp));
+ }
+ }
+ }
+
+	/**
+	 * Reports a failed crawl: marks the monitor done, prints the failure and shows an error on the form.
+	 * @param monitor progress monitor of the failing job
+	 * @param e the exception that aborted the crawl
+	 * @param message user-facing text; the exception detail is appended when there is one
+	 * @return always Status.CANCEL_STATUS
+	 */
+	private IStatus handleException(IProgressMonitor monitor, Exception e, String message) {
+		monitor.done();
+		System.out.println(message);
+		e.printStackTrace();
+		String text = e.getMessage() == null ? message : message + ": " + e.getMessage(); // separator added; never the literal "null"
+		TacitFormComposite.updateStatusMessage(getViewSite(), text, IStatus.ERROR, form);
+		return Status.CANCEL_STATUS;
+	}
+
+ @Override
+ public void setFocus() {
+ form.setFocus(); // delegate focus to the view's form
+ }
+
+ /**
+ * Output file creation with statistics
+ *
+ * @param location
+ * @param title
+ * @param dateObj
+ * @param perf
+ * @param kValue
+ * @param monitor
+ */
+
+
+	/**
+	 * Creates the view's scrolled form, decorates its heading, sets its title and lays out its body.
+	 * @param parent composite the form is created in; its display is used for the toolkit
+	 * @param title text set as the form title
+	 * @return the toolkit that created the form
+	 */
+	private FormToolkit createFormBodySection(Composite parent, String title) {
+		FormToolkit formToolkit = new FormToolkit(parent.getDisplay());
+		form = formToolkit.createScrolledForm(parent);
+		formToolkit.decorateFormHeading(form.getForm());
+		form.setText(title);
+		// Single-column grid for whatever is placed in the form body.
+		Composite body = form.getBody();
+		GridLayoutFactory.fillDefaults().numColumns(1).equalWidth(true).applyTo(body);
+		return formToolkit;
+	}
+
+}
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/handlers/GutenbergCrawlerViewHandler.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/handlers/GutenbergCrawlerViewHandler.java
new file mode 100644
index 00000000..ce83a31d
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/handlers/GutenbergCrawlerViewHandler.java
@@ -0,0 +1,24 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.ui.handlers;
+
+import org.eclipse.core.commands.AbstractHandler;
+import org.eclipse.core.commands.ExecutionEvent;
+import org.eclipse.core.commands.ExecutionException;
+import org.eclipse.ui.PartInitException;
+import org.eclipse.ui.handlers.HandlerUtil;
+
+import edu.usc.cssl.tacit.crawlers.gutenberg.ui.GutenbergCrawlerView;
+
+/** Command handler that shows the Gutenberg Crawler view in the active workbench page. */
+public class GutenbergCrawlerViewHandler extends AbstractHandler {
+	@Override
+	public Object execute(ExecutionEvent event) throws ExecutionException {
+		try {
+			HandlerUtil.getActiveWorkbenchWindowChecked(event).getActivePage()
+					.showView(GutenbergCrawlerView.ID);
+		} catch (PartInitException e) {
+			// Hand the failure to the command framework instead of only printing it.
+			throw new ExecutionException("Unable to open the Gutenberg Crawler view", e);
+		}
+		return null;
+	}
+}
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/GutenbergCrawlerViewImageRegistry.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/GutenbergCrawlerViewImageRegistry.java
new file mode 100644
index 00000000..330eb8ab
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/GutenbergCrawlerViewImageRegistry.java
@@ -0,0 +1,46 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal;
+
+import org.eclipse.jface.resource.ImageDescriptor;
+import org.eclipse.jface.resource.ImageRegistry;
+import org.eclipse.swt.graphics.Image;
+
+/**
+ * Singleton, created on first use, that keeps the Gutenberg Crawler view icons in a JFace ImageRegistry.
+ */
+public class GutenbergCrawlerViewImageRegistry {
+	static GutenbergCrawlerViewImageRegistry imgIcon;
+	ImageRegistry ir = new ImageRegistry();
+
+	private GutenbergCrawlerViewImageRegistry() {
+		register(IGutenbergCrawlerViewConstants.IMAGE_LRUN_OBJ, "/icons/lrun_obj.gif");
+		register(IGutenbergCrawlerViewConstants.IMAGE_HELP_CO, "/icons/help_contents.gif");
+		register(IGutenbergCrawlerViewConstants.IMAGE_GUTENBERG_OBJ, "/icons/GutenbergCrawlerIcon.png");
+	}
+
+	/** Stores a descriptor for the given icon file, created against this class, under the key. */
+	private void register(String key, String iconPath) {
+		ir.put(key, ImageDescriptor.createFromFile(GutenbergCrawlerViewImageRegistry.class, iconPath));
+	}
+
+	/** Returns the shared instance, creating it on the first call. */
+	public static GutenbergCrawlerViewImageRegistry getImageIconFactory() {
+		if (imgIcon != null) {
+			return imgIcon;
+		}
+		imgIcon = new GutenbergCrawlerViewImageRegistry();
+		return imgIcon;
+	}
+
+	/** Returns the descriptor associated with the given key in this registry, or null if none. */
+	public ImageDescriptor getImageDescriptor(String key) {
+		return ir.getDescriptor(key);
+	}
+
+	/** Returns what the registry holds under the given image name. */
+	public Image getImage(String imageName) {
+		return ir.get(imageName);
+	}
+}
+
+
+/* This file creates images from the icon files we provide and allocates the OS resources needed to display them. */
\ No newline at end of file
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/IGutenbergCrawlerViewConstants.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/IGutenbergCrawlerViewConstants.java
new file mode 100644
index 00000000..7c0b5f37
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/IGutenbergCrawlerViewConstants.java
@@ -0,0 +1,11 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal;
+
+public interface IGutenbergCrawlerViewConstants { // interface fields are implicitly public static final
+	String IMAGE_LRUN_OBJ = "lrun_obj"; // image keys
+	String IMAGE_HELP_CO = "help_co";
+	String IMAGE_FILE_OBJ = "File_obj";
+	String IMAGE_FOLDER_OBJ = "Foler_obj";
+	String IMAGE_GUTENBERG_OBJ = "gutenberg_crawler";
+	String DEFAULT_CORPUS_LOCATION = System.getProperty("user.dir") + System.getProperty("file.separator") + "json_corpuses" + System.getProperty("file.separator") + "gutenberg";
+}
+
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/META-INF/MANIFEST.MF b/edu.usc.cssl.tacit.crawlers.gutenberg/META-INF/MANIFEST.MF
new file mode 100644
index 00000000..a1b592b1
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg/META-INF/MANIFEST.MF
@@ -0,0 +1,15 @@
+Manifest-Version: 1.0
+Bundle-ManifestVersion: 2
+Bundle-Name: Gutenberg
+Bundle-SymbolicName: edu.usc.cssl.tacit.crawlers.gutenberg
+Bundle-Version: 1.0.0.qualifier
+Bundle-Activator: edu.usc.cssl.tacit.crawlers.gutenberg.Activator
+Require-Bundle: org.eclipse.ui,
+ org.eclipse.core.runtime,
+ org.jsoup;bundle-version="1.7.2",
+ edu.usc.cssl.tacit.common.ui;bundle-version="1.0.0"
+Bundle-RequiredExecutionEnvironment: JavaSE-1.6
+Bundle-ActivationPolicy: lazy
+Export-Package: edu.usc.cssl.tacit.crawlers.gutenberg.services
+Bundle-ClassPath: jackson-core-2.5.0.jar,
+ .
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/build.properties b/edu.usc.cssl.tacit.crawlers.gutenberg/build.properties
new file mode 100644
index 00000000..1ce3283d
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg/build.properties
@@ -0,0 +1,5 @@
+source.. = src/
+output.. = bin/
+bin.includes = META-INF/,\
+ .,\
+ jackson-core-2.5.0.jar
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/jackson-core-2.5.0.jar b/edu.usc.cssl.tacit.crawlers.gutenberg/jackson-core-2.5.0.jar
new file mode 100644
index 00000000..e8ca122f
Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg/jackson-core-2.5.0.jar differ
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/Activator.java b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/Activator.java
new file mode 100644
index 00000000..345ade72
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/Activator.java
@@ -0,0 +1,50 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg;
+
+import org.eclipse.ui.plugin.AbstractUIPlugin;
+import org.osgi.framework.BundleContext;
+
+/**
+ * Plug-in activator for the Gutenberg crawler bundle; holds the shared instance between start and stop.
+ */
+public class Activator extends AbstractUIPlugin {
+
+	/** The plug-in ID. */
+	public static final String PLUGIN_ID = "edu.usc.cssl.tacit.crawlers.gutenberg"; //$NON-NLS-1$
+
+	/** The shared instance; set in start and reset to null in stop. */
+	private static Activator plugin;
+
+	/**
+	 * The constructor
+	 */
+	public Activator() {
+	}
+
+	/**
+	 * Starts the plug-in, then records this object as the shared instance.
+	 */
+	@Override
+	public void start(BundleContext context) throws Exception {
+		super.start(context);
+		plugin = this;
+	}
+
+	/**
+	 * Clears the shared instance, then stops the plug-in.
+	 */
+	@Override
+	public void stop(BundleContext context) throws Exception {
+		plugin = null;
+		super.stop(context);
+	}
+
+	/**
+	 * Returns the shared instance
+	 *
+	 * @return the shared instance, or null when start has not run or stop has
+	 */
+	public static Activator getDefault() {
+		return plugin;
+	}
+
+}
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergConstants.java b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergConstants.java
new file mode 100644
index 00000000..faa103ad
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergConstants.java
@@ -0,0 +1,538 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.services;
+
+import java.util.HashMap;
+
+public class GutenbergConstants {
+ public static HashMap site2Link = new HashMap();
+ public static HashMap sites = new HashMap();
+ static String animals[] = new String[]{"Birds",
+ "Insects",
+ "Mammals",
+ "Reptiles and Amphibians",
+ "Trapping"};
+
+ static String children[] = new String[]{"Anthologies",
+ "Biography",
+ "Book Series",
+ "Verse",
+ "Christmas",
+ "Fiction",
+ "History",
+ "Instructional Books",
+ "Literature",
+ "Myths and Fairy Tales",
+ "Religion",
+ "School Stories"
+
+ };
+
+ static String classics[] = new String[]{"Classics"};
+
+ static String Countries[] =new String[]{"Africa",
+ "Argentina",
+ "Australia",
+ "Bulgaria",
+ "Canada",
+ "Czech",
+ "Egypt",
+ "France",
+ "Germany",
+ "Greece",
+ "India",
+ "Italy",
+ "New Zealand",
+ "Norway",
+ "South Africa",
+ "South America",
+ "Travel",
+ "United Kingdom",
+ "United States"
+ };
+ static String Crime[] = new String[]{"Crime Fiction",
+ "Crime Non Fiction",
+ "Detective Fiction",
+ "Mystery Fiction"
+ };
+
+ static String Knowledge[] = new String[]{"Education",
+ "Language Education"
+ };
+
+ static String fiction[] = new String[]{"Adventure",
+ "Children's Fiction",
+ "Crime Fiction",
+ "Detective Fiction",
+ "Erotic Fiction",
+ "Fantasy",
+ "General Fiction",
+ "Gothic Fiction",
+ "Historical Fiction",
+ "Horror",
+ "Humor",
+ "Movie Books",
+ "Mystery Fiction",
+ "Precursors of Science Fiction",
+ "Romantic Fiction",
+ "School Stories",
+ "Science Fiction",
+ "Western"
+ };
+
+ static String fine_arts[] = new String[]{"Architecture",
+ "Art"
+ };
+
+ static String general_works[] = new String[]{"Children's Periodicals:Dew Drops",
+ "Children's Periodicals:The Girls Own Paper",
+ "Children's Periodicals:Golden Days for Boys and Girls",
+ "Children's Periodicals:The Great Round World And What Is Going On In It",
+ "Children's Periodicals:The Nursery",
+ "Children's Periodicals:St. Nicholas Magazine for Boys and Girls",
+ "Reference"
+ };
+
+ static String geography[] = new String[]{"Anthropology",
+ "CIA World Factbooks",
+ "Folklore",
+ "Maps and Cartography",
+ "Women's Travel Journals"
+ };
+
+ static String history[] = new String[]{"Archaeology",
+ "Biographies",
+ "Children's History",
+ "Classical Antiquity"
+ };
+
+ static String language_and_literature[] = new String[]{"Esperanto",
+ "German Language Books",
+ "Language Education",
+ "Plays"
+ };
+
+ static String law[] = new String[]{"British Law",
+ "Canon Law",
+ "Noteworthy Trials",
+ "United States Law"
+ };
+
+ static String music[] = new String[]{"Music",
+ "Opera"
+ };
+
+ static String periodicals[] = new String[]{"Ainslee's",
+ "The Aldine",
+ "The American Architect and Building News",
+ "The American Journal of Archaeology",
+ "The American Missionary",
+ "The American Quarterly Review",
+ "The Arena",
+ "The Argosy",
+ "Armour's Monthly Cook Book",
+ "Astounding Stories",
+ "The Atlantic Monthly",
+ "The Baptist Magazine",
+ "Barnavännen",
+ "The Bay State Monthly",
+ "Bird-Lore",
+ "Birds, Illustrated by Color Photography",
+ "Blackwood's Edinburgh Magazine",
+ "The Botanical Magazine",
+ "The Brochure Series of Architectural Illustration",
+ "Buchanan's Journal of Man",
+ "Bulletin of Lille", //French.Page requires translation
+ "The Catholic World",
+ "Celtic Magazine",
+ "Chambers's Edinburgh Journal",
+ "The Christian Foundation",
+ "The Church of England Magazine",
+ "The Contemporary Review",
+ "Continental Monthly",
+ "Current History",
+ "De Aarde en haar Volken", //Other language.Page requires translation
+ "Donahoe's Magazine",
+ "The Economist",
+ "The Esperantist",
+ "The Galaxy",
+ "Garden and Forest",
+ "Godey's Lady's Book",
+ "Graham's Magazine",
+ "Harper's New Monthly Magazine",
+ "Harper's Young People",
+ "The Idler",
+ "The Illustrated War News",
+ "The International Magazine of Literature, Art, and Science",
+ "The Irish Ecclesiastical Record",
+ "The Irish Penny Journal",
+ "Journal of Entomology and Zoology",
+ "The Journal of Negro History",
+ "The Knickerbocker",
+ "L'Illustration",
+ "Lippincott's Magazine",
+ "Little Folks",
+ "London Medical Gazette",
+ "The Mayflower",
+ "McClure's Magazine",
+ "The Menorah Journal",
+ "The Mentor",
+ "The Mirror of Literature, Amusement, and Instruction",
+ "The Mirror of Taste, and Dramatic Censor",
+ "Mother Earth",
+ "Mrs Whittelsey's Magazine for Mothers and Daughters",
+ "The National Preacher",
+ "The North American Medical and Surgical Journal",
+ "Northern Nut Growers Association",
+ "Notes and Queries",
+ "Our Young Folks",
+ "Poetry, A Magazine of Verse",
+ "Popular Science Monthly",
+ "Prairie Farmer",
+ "Punch",
+ "Punchinello",
+ "Scientific American",
+ "The Scrap Book",
+ "Scribner's Magazine",
+ "The Speaker",
+ "The Stars and Stripes",
+ "The Strand Magazine",
+ "The Haslemere Museum Gazette",
+ "The Unpopular Review",
+ "The Writer",
+ "The Yellow Book",
+
+ };
+
+ static String psychology_and_philosophy[] = new String[]{"Bibliomania",
+ "Philosophy",
+ "Psychology",
+ "Witchcraft"
+ };
+
+ static String religion[] = new String[]{"Atheism",
+ "Bahá'í Faith",
+ "Buddhism",
+ "Christianity",
+ "Hinduism",
+ "Islam",
+ "Judaism",
+ "Latter Day Saints",
+ "Mythology",
+ "Paganism"
+ };
+
+ static String science[] = new String[]{"Astronomy",
+ "Biology",
+ "Botany",
+ "Chemistry",
+ "Ecology",
+ "Geology",
+ "Mathematics",
+ "Microbiology",
+ "Microscopy",
+ "Mycology",
+ "Natural History",
+ "Physics",
+ "Physiology",
+ "Science",
+ "Scientific American",
+ "Zoology"
+ };
+
+ static String social_sciences[] = new String[]{"Anarchism",
+ "Racism",
+ "Slavery",
+ "Sociology",
+ "Suffrage",
+ "Transportation"
+ };
+
+ static String technology[] = new String[]{"Cookery",
+ "Crafts",
+ "Engineering",
+ "Manufacturing",
+ "Technology",
+ "Woodwork"
+
+ };
+
+ static String wars[] = new String[]{"American Revolutionary War",
+ "Boer War",
+ "English Civil War",
+ "Spanish American War",
+ "US Civil War",
+ "World War I",
+ "World War II"
+ };
+
+ static{
+ sites.put(0, animals);
+ sites.put(1, children);
+ sites.put(2,classics);
+ sites.put(3, Countries);
+ sites.put(4, Crime);
+ sites.put(5, Knowledge);
+ sites.put(6, fiction);
+ sites.put(7, fine_arts);
+ sites.put(8, general_works);
+ sites.put(9, geography);
+ sites.put(10, history);
+ sites.put(11, language_and_literature);
+ sites.put(12, law);
+ sites.put(13, music);
+ sites.put(14, periodicals);
+ sites.put(15, psychology_and_philosophy);
+ sites.put(16, religion);
+ sites.put(17, science);
+ sites.put(18, social_sciences);
+ sites.put(19, technology);
+ sites.put(20, wars);
+ site2Link.put("Birds", "Animals-Wild_(Bookshelf)-Birds");
+ site2Link.put("Insects", "Animals-Wild_(Bookshelf)-Insects");
+ site2Link.put("Mammals", "Animals-Wild_(Bookshelf)-Mammals");
+ site2Link.put("Reptiles and Amphibians", "Animals-Wild_(Bookshelf)-Reptiles_and_Amphibians");
+ site2Link.put("Trapping", "Animals-Wild_(Bookshelf)-Trapping");
+
+ site2Link.put("Anthologies", "Children%27s_Anthologies_(Bookshelf)");
+ site2Link.put("Biography", "Children%27s_Biography_(Bookshelf)");
+ site2Link.put("Book Series", "Children%27s_Book_Series_(Bookshelf)");
+ site2Link.put("Verse", "Children%27s_Verse_(Bookshelf)");
+ site2Link.put("Christmas", "Christmas_(Bookshelf)");
+ site2Link.put("Fiction", "Children%27s_Fiction_(Bookshelf)");
+ site2Link.put("History", "Children%27s_History_(Bookshelf)");
+ site2Link.put("Instructional Books", "Children%27s_Instructional_Books_(Bookshelf)");
+ site2Link.put("Literature", "Children%27s_Literature_(Bookshelf)");
+ site2Link.put("Myths and Fairy Tales", "Children%27s_Myths,_Fairy_Tales,_etc._(Bookshelf)");
+ site2Link.put("Religion", "Children%27s_Religion_(Bookshelf)");
+ site2Link.put("School Stories", "School_Stories_(Bookshelf)");
+
+ site2Link.put("Classics", "Category:Classics_Bookshelf");
+
+ site2Link.put("Africa", "Africa_(Bookshelf)");
+ site2Link.put("Argentina", "Argentina_(Bookshelf)");
+ site2Link.put("Australia", "Australia_(Bookshelf)");
+ site2Link.put("Bulgaria","Bulgaria_(Bookshelf)");
+ site2Link.put("Canada","Canada_(Bookshelf)");
+ site2Link.put("Czech","Czech_(Bookshelf)");
+ site2Link.put("Egypt","Egypt_(Bookshelf)");
+ site2Link.put("France","France_(Bookshelf)");
+ site2Link.put("Germany","Germany_(Bookshelf)");
+ site2Link.put("Greece","Greece_(Bookshelf)");
+ site2Link.put("India","India_(Bookshelf)");
+ site2Link.put("Italy","Italy_(Bookshelf)");
+ site2Link.put("New Zealand","New_Zealand");
+ site2Link.put("Norway","Norway_(Bookshelf)");
+ site2Link.put("South Africa","South_Africa_(Bookshelf)");
+ site2Link.put("South America","South_America_(Bookshelf)");
+ site2Link.put("Travel","Travel_(Bookshelf)");
+ site2Link.put("United Kingdom","United_Kingdom_(Bookshelf)");
+ site2Link.put("United States","United_States_(Bookshelf)");
+
+ site2Link.put("Crime Fiction","Crime_Fiction_(Bookshelf)");
+ site2Link.put("Crime Non Fiction","Crime_Nonfiction_(Bookshelf)");
+ site2Link.put("Detective Fiction","Detective_Fiction_(Bookshelf)");
+ site2Link.put("Mystery Fiction","Mystery_Fiction_(Bookshelf)");
+
+ site2Link.put("Education","Education");
+ site2Link.put("Language Education","Language_Education_(Bookshelf)");
+
+ site2Link.put("Adventure","Adventure_(Bookshelf)");
+ site2Link.put("Children's Fiction","Children%27s_Fiction_(Bookshelf)");
+ site2Link.put("Crime Fiction","Crime_Fiction_(Bookshelf)");
+ site2Link.put("Detective Fiction","Detective_Fiction_(Bookshelf)");
+ site2Link.put("Erotic Fiction","Erotic_Fiction_(Bookshelf)");
+ site2Link.put("Fantasy","Fantasy_(Bookshelf)");
+ site2Link.put("General Fiction","General_Fiction");
+ site2Link.put("Gothic Fiction","Gothic_Fiction_(Bookshelf)");
+ site2Link.put("Historical Fiction","Historical_Fiction_(Bookshelf)");
+ site2Link.put("Horror","Horror_(Bookshelf)");
+ site2Link.put("Humor","Humor_(Bookshelf)");
+ site2Link.put("Movie Books","Movie_Books_(Bookshelf)");
+ site2Link.put("Mystery Fiction","Mystery_Fiction_(Bookshelf)");
+ site2Link.put("Precursors of Science Fiction","Precursors_of_Science_Fiction_(Bookshelf)");
+ site2Link.put("Romantic Fiction","Romantic_Fiction_(Bookshelf)");
+ site2Link.put("School Stories","School_Stories_(Bookshelf)");
+ site2Link.put("Science Fiction","Science_Fiction_(Bookshelf)");
+ site2Link.put("Western","Western_(Bookshelf)");
+
+ site2Link.put("Architecture","Architecture_(Bookshelf)");
+ site2Link.put("Art","Art_(Bookshelf)");
+
+ site2Link.put("Children's Periodicals:Dew Drops","Dew_Drops_(Bookshelf)");
+ site2Link.put("Children's Periodicals:The Girls Own Paper","The_Girls_Own_Paper_(Bookshelf)");
+ site2Link.put("Children's Periodicals:Golden Days for Boys and Girls","Golden_Days_for_Boys_and_Girls_(Bookshelf)");
+ site2Link.put("Children's Periodicals:The Great Round World And What Is Going On In It","The_Great_Round_World_And_What_Is_Going_On_In_It_(Bookshelf)");
+ site2Link.put("Children's Periodicals:The Nursery","The_Nursery_(Bookshelf)");
+ site2Link.put("Children's Periodicals:St. Nicholas Magazine for Boys and Girls","St._Nicholas_Magazine_for_Boys_and_Girls_(Bookshelf)");
+ site2Link.put("Reference","Reference_(Bookshelf)");
+
+ site2Link.put("Anthropology","Anthropology_(Bookshelf)");
+ site2Link.put("CIA World Factbooks","CIA_World_Factbooks_(Bookshelf)");
+ site2Link.put("Folklore","Folklore_(Bookshelf)");
+ site2Link.put("Maps and Cartography","Maps_and_Cartography_(Bookshelf)");
+ site2Link.put("Women's Travel Journals","Women%27s_Travel_Journals_(Bookshelf)");
+
+ site2Link.put("Archaeology","Archaeology_(Bookshelf)");
+ site2Link.put("Biographies","Biographies_(Bookshelf)");
+ site2Link.put("Children's History","Children%27s_History_(Bookshelf)");
+ site2Link.put("Classical Antiquity","Classical_Antiquity_(Bookshelf)");
+
+ site2Link.put("Esperanto","Esperanto_(Bookshelf)");
+ site2Link.put("German Language Books","German_Language_Books_(Bookshelf)");
+ site2Link.put("Language Education","Language_Education_(Bookshelf)");
+ site2Link.put("Plays","Plays_(Bookshelf)");
+
+ site2Link.put("British Law","British_Law_(Bookshelf)");
+ site2Link.put("Canon Law","Canon_Law");
+ site2Link.put("Noteworthy Trials","Noteworthy_Trials(Bookshelf)");
+ site2Link.put("United States Law","United_States_Law_(Bookshelf)");
+
+ site2Link.put("Music","Music_(Bookshelf)");
+ site2Link.put("Opera","Opera_(Bookshelf)");
+
+ site2Link.put("Bibliomania","Bibliomania_(Bookshelf)");
+ site2Link.put("Philosophy","Philosophy_(Bookshelf)");
+ site2Link.put("Psychology","Psychology_(Bookshelf)");
+ site2Link.put("Witchcraft","Witchcraft_(Bookshelf)");
+
+ site2Link.put("Atheism","Atheism_(Bookshelf)");
+ site2Link.put("Bahá'í Faith","Bahá%27í_Faith_(Bookshelf)");
+ site2Link.put("Buddhism","Buddhism_(Bookshelf)");
+ site2Link.put("Christianity","Christianity_(Bookshelf)");
+ site2Link.put("Hinduism","Hinduism_(Bookshelf)");
+ site2Link.put("Islam","Islam_(Bookshelf)");
+ site2Link.put("Judaism","Judaism_(Bookshelf)");
+ site2Link.put("Latter Day Saints","Latter_Day_Saints_(Bookshelf)");
+ site2Link.put("Mythology","Mythology_(Bookshelf)");
+ site2Link.put("Paganism","Paganism_(Bookshelf)");
+
+ site2Link.put("Astronomy","Astronomy_(Bookshelf)");
+ site2Link.put("Biology","Biology_(Bookshelf)");
+ site2Link.put("Botany","Botany_(Bookshelf)");
+ site2Link.put("Chemistry","Chemistry_(Bookshelf)");
+ site2Link.put("Ecology","Ecology_(Bookshelf)");
+ site2Link.put("Geology","Geology_(Bookshelf)");
+ site2Link.put("Mathematics","Mathematics_(Bookshelf)");
+ site2Link.put("Microbiology","Microbiology_(Bookshelf)");
+ site2Link.put("Microscopy","Microscopy_(Bookshelf)");
+ site2Link.put("Mycology","Mycology_(Bookshelf)");
+ site2Link.put("Natural History","Natural_History_(Bookshelf)");
+ site2Link.put("Physics","Physics_(Bookshelf)");
+ site2Link.put("Physiology","Physiology_(Bookshelf)");
+ site2Link.put("Science","Science");
+ site2Link.put("Scientific American","Scientific_American_(Bookshelf)");
+ site2Link.put("Zoology","Zoology_(Bookshelf)");
+
+ site2Link.put("Anarchism","Anarchism_(Bookshelf)");
+ site2Link.put("Racism","Racism_(Bookshelf)");
+ site2Link.put("Slavery","Slavery_(Bookshelf)");
+ site2Link.put("Sociology","Sociology_(Bookshelf)");
+ site2Link.put("Suffrage","Suffrage");
+ site2Link.put("Transportation","Transportation_(Bookshelf)");
+
+ site2Link.put("Energy Research","fenrg");
+ site2Link.put("ICT","fict");
+ site2Link.put("Materials","fmats");
+ site2Link.put("Mechanical Engineering","fmech");
+ site2Link.put("Robotics and AI","frobt");
+ site2Link.put("Communication","fcomm");
+ site2Link.put("Digital Humanities","fdigh");
+ site2Link.put("Sociology","fsoc");
+
+ site2Link.put("Cookery","Cookery_(Bookshelf)");
+ site2Link.put("Crafts","Crafts_(Bookshelf)");
+ site2Link.put("Engineering","Engineering_(Bookshelf)");
+ site2Link.put("Manufacturing","Manufacturing");
+ site2Link.put("Technology","Technology_(Bookshelf)");
+ site2Link.put("Woodwork","Woodwork");
+
+ site2Link.put("American Revolutionary War","American_Revolutionary_War_(Bookshelf)");
+ site2Link.put("Boer War","Boer_War_(Bookshelf)");
+ site2Link.put("English Civil War","English_Civil_War_(Bookshelf)");
+ site2Link.put("Spanish American War","Spanish_American_War_(Bookshelf)");
+ site2Link.put("US Civil War","US_Civil_War_(Bookshelf)");
+ site2Link.put("World War I","World_War_I_(Bookshelf)");
+ site2Link.put("World War II","World_War_II_(Bookshelf)");
+
+ site2Link.put("Ainslee's","Ainslee%27s_(Bookshelf)");
+ site2Link.put("The Aldine","The_Aldine_(Bookshelf)");
+ site2Link.put("The American Architect and Building News","The_American_Architect_and_Building_News_(Bookshelf)");
+ site2Link.put("The American Journal of Archaeology","The_American_Journal_of_Archaeology_(Bookshelf)");
+ site2Link.put("The American Missionary","The_American_Missionary_(Bookshelf)");
+ site2Link.put("The American Quarterly Review","The_American_Quarterly_Review_(Bookshelf)");
+ site2Link.put("The Arena","The_Arena_(Bookshelf)");
+ site2Link.put("The Argosy","The_Argosy_(Bookshelf)");
+ site2Link.put("Armour's Monthly Cook Book","Armour%27s_Monthly_Cook_Book_(Bookshelf)");
+ site2Link.put("Astounding Stories","Astounding_Stories_(Bookshelf)");
+ site2Link.put("The Atlantic Monthly","The_Atlantic_Monthly_(Bookshelf)");
+ site2Link.put("The Baptist Magazine","The_Baptist_Magazine_(Bookshelf)");
+ site2Link.put("Barnavännen","Barnavännen_(Bookshelf)");
+ site2Link.put("The Bay State Monthly","The_Bay_State_Monthly_(Bookshelf)");
+ site2Link.put("Bird-Lore","Bird-Lore_(Bookshelf)");
+ site2Link.put("Birds, Illustrated by Color Photography","Birds,_Illustrated_by_Color_Photography_(Bookshelf)");
+ site2Link.put("Blackwood's Edinburgh Magazine","Blackwood%27s_Edinburgh_Magazine_(Bookshelf)");
+ site2Link.put("The Botanical Magazine","The_Botanical_Magazine_(Bookshelf)");
+ site2Link.put("The Brochure Series of Architectural Illustration","The_Brochure_Series_of_Architectural_Illustration_(Bookshelf)");
+ site2Link.put("Buchanan's Journal of Man","Buchanan%27s_Journal_of_Man_(Bookshelf)");
+ site2Link.put("Bulletin of Lille","Bulletin_de_Lille_(Bookshelf)");
+ site2Link.put("The Catholic World","The_Catholic_World_(Bookshelf)");
+ site2Link.put("Celtic Magazine","Celtic_Magazine_(Bookshelf)");
+ site2Link.put("Chambers's Edinburgh Journal","Chambers%27s_Edinburgh_Journal_(Bookshelf)");
+ site2Link.put("The Christian Foundation","The_Christian_Foundation_(Bookshelf)");
+ site2Link.put("The Church of England Magazine","The_Church_of_England_Magazine_(Bookshelf)");
+ site2Link.put("The Contemporary Review","The_Contemporary_Review_(Bookshelf)");
+ site2Link.put("Continental Monthly","Continental_Monthly_(Bookshelf)");
+ site2Link.put("Current History","Current_History_(Bookshelf)");
+ site2Link.put("De Aarde en haar Volken","De_Aarde_en_haar_Volken_(Bookshelf)");
+ site2Link.put("Donahoe's Magazine","Donahoe%27s_Magazine_(Bookshelf)");
+ site2Link.put("The Economist","The_Economist_(Bookshelf)");
+ site2Link.put("The Esperantist","The_Esperantist_(Bookshelf)");
+ site2Link.put("The Galaxy","The_Galaxy_(Bookshelf)");
+ site2Link.put("Garden and Forest","Garden_and_Forest_(Bookshelf)");
+ site2Link.put("Godey's Lady's Book","Godey%27s_Lady%27s_Book_(Bookshelf)");
+ site2Link.put("Graham's Magazine","Graham%27s_Magazine_(Bookshelf)");
+ site2Link.put("Harper's New Monthly Magazine","Harper%27s_New_Monthly_Magazine_(Bookshelf)");
+ site2Link.put("Harper's Young People","Harper%27s_Young_People_(Bookshelf)");
+ site2Link.put("The Idler","The_Idler_(Bookshelf)");
+ site2Link.put("The Illustrated War News","The_Illustrated_War_News_(Bookshelf)");
+ site2Link.put("The International Magazine of Literature, Art, and Science","The_International_Magazine_of_Literature,_Art,_and_Science_(Bookshelf)");
+ site2Link.put("The Irish Ecclesiastical Record","The_Irish_Ecclesiastical_Record_(Bookshelf)");
+ site2Link.put("The Irish Penny Journal","The_Irish_Penny_Journal_(Bookshelf)");
+ site2Link.put("Journal of Entomology and Zoology","Journal_of_Entomology_and_Zoology_(Bookshelf)");
+ site2Link.put("The Journal of Negro History","The_Journal_of_Negro_History_(Bookshelf)");
+ site2Link.put("The Knickerbocker","The_Knickerbocker_(Bookshelf)");
+ site2Link.put("L'Illustration","L%27Illustration_(Bookshelf)");
+ site2Link.put("Lippincott's Magazine","Lippincott%27s_Magazine_(Bookshelf)");
+ site2Link.put("Little Folks","Little_Folks_(Bookshelf)");
+ site2Link.put("London Medical Gazette","London_Medical_Gazette");
+ site2Link.put("The Mayflower","The_Mayflower_(Bookshelf)");
+ site2Link.put("McClure's Magazine","McClure%27s_Magazine_(Bookshelf)");
+ site2Link.put("The Menorah Journal","The_Menorah_Journal_(Bookshelf)");
+ site2Link.put("The Mentor","The_Mentor_(Bookshelf)");
+ site2Link.put("The Mirror of Literature, Amusement, and Instruction","The_Mirror_of_Literature,_Amusement,_and_Instruction_(Bookshelf)");
+ site2Link.put("The Mirror of Taste, and Dramatic Censor","The_Mirror_of_Taste,_and_Dramatic_Censor_(Bookshelf)");
+ site2Link.put("Mother Earth","Mother_Earth_(Bookshelf)");
+ site2Link.put("Mrs Whittelsey's Magazine for Mothers and Daughters","Mrs_Whittelsey%27s_Magazine_for_Mothers_and_Daughters_(Bookshelf)");
+ site2Link.put("The National Preacher","The_National_Preacher_(Bookshelf)");
+ site2Link.put("The North American Medical and Surgical Journal","The_North_American_Medical_and_Surgical_Journal_(Bookshelf)");
+ site2Link.put("Northern Nut Growers Association","Northern_Nut_Growers_Association_(Bookshelf)");
+ site2Link.put("Notes and Queries","Notes_and_Queries_(Bookshelf)");
+ site2Link.put("Our Young Folks","Our_Young_Folks_(Bookshelf)");
+ site2Link.put("Poetry, A Magazine of Verse","Poetry,_A_Magazine_of_Verse_(Bookshelf)");
+ site2Link.put("Popular Science Monthly","Popular_Science_Monthly_(Bookshelf)");
+ site2Link.put("Prairie Farmer","Prairie_Farmer_(Bookshelf)");
+ site2Link.put("Punch","Punch_(Bookshelf)");
+ site2Link.put("Punchinello","Punchinello_(Bookshelf)");
+ site2Link.put("Scientific American","Scientific_American_(Bookshelf)");
+ site2Link.put("The Scrap Book","The_Scrap_Book_(Bookshelf)");
+ site2Link.put("Scribner's Magazine","Scribner%27s_Magazine_(Bookshelf)");
+ site2Link.put("The Speaker","The_Speaker_(Bookshelf)");
+ site2Link.put("The Stars and Stripes","The_Stars_and_Stripes_(Bookshelf)");
+ site2Link.put("The Strand Magazine","The_Strand_Magazine_(Bookshelf)");
+ site2Link.put("The Haslemere Museum Gazette","The_Haslemere_Museum_Gazette_(Bookshelf)");
+ site2Link.put("The Unpopular Review","The_Unpopular_Review_(Bookshelf)");
+ site2Link.put("The Writer","The_Writer_(Bookshelf)");
+ site2Link.put("The Yellow Book","The_Yellow_Book_(Bookshelf)");
+
+
+ }
+}
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergMain.java b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergMain.java
new file mode 100644
index 00000000..0f6e033b
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergMain.java
@@ -0,0 +1,165 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.services;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.eclipse.core.runtime.IProgressMonitor;
+import org.jsoup.HttpStatusException;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import com.fasterxml.jackson.core.JsonEncoding;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+
+import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
+
+//import edu.usc.cssl.tacit.crawlers.frontier.services.FrontierConstants;
+
+
+public class GutenbergMain {
+ JsonFactory jsonFactory;
+ JsonGenerator jsonGenerator;
+ IProgressMonitor monitor;
+
+ /**
+ * Crawls one Project Gutenberg bookshelf and streams the result as a JSON array into
+ * dir/domain.json.
+ *
+ * The visible part of the method looks up the bookshelf page name for {@code domain} in
+ * {@code GutenbergConstants.site2Link}, loads {@code IGutenbergConstants.BASE_URL_DOMAIN + pageName},
+ * collects the numeric last path segment of every link containing
+ * {@code www.gutenberg.org/ebooks/} as an ebook id, and caps {@code limit} to the number of ids found.
+ *
+ * NOTE(review): in this copy the text is damaged at the inner {@code for} loop below: the loop
+ * header is truncated and what follows reads like the body of a different method that writes
+ * "Latest Search.json". A second truncated loop header appears further down. Restore the
+ * original text before changing any code in this range.
+ *
+ * @param dir directory in which the output file is created
+ * @param domain bookshelf name; used as the key into {@code site2Link} and as the output file name
+ * @param limit maximum number of books to download
+ * @param monitor progress monitor; stored in the {@code monitor} field and advanced by one unit once the ids are collected
+ * @throws IOException declared by the signature; I/O failures while collecting the ids are caught and printed below
+ */
+ public void crawl(String dir, String domain, int limit,IProgressMonitor monitor) throws IOException{
+ ConsoleView.printlInConsoleln("For Sub Domain: " + domain);
+ //System.out.println("I am in crawl");
+ this.monitor = monitor;
+ //System.out.println("dir------" + dir);
+ //System.out.println("domain---" + domain);
+ //System.out.println("limit-----" + limit);
+ ArrayList temp = new ArrayList();
+ int downloadCount = 0;
+ jsonFactory = new JsonFactory();
+ //this.monitor = monitor;
+ File streamFile = new File(dir+File.separator+domain+".json");
+ int count =0;
+ int downloadedCount =0;
+ File f=null;
+ // Debug output only: count is still 0 here and no loop has been entered yet.
+ System.out.println("I am in while---and count is ----" + count);
+ try {
+ jsonGenerator = jsonFactory.createGenerator(streamFile, JsonEncoding.UTF8);
+ jsonGenerator.useDefaultPrettyPrinter();
+ jsonGenerator.writeStartArray();
+
+ f = new File(dir+File.separator+domain+".txt");
+ //System.out.println("File Name--------" + dir+File.separator+domain+".txt");
+ //String domain1= domain.replaceAll("\\s+", "_");
+ //String domain2 = domain1.replace("'", "%27");
+ // NOTE(review): presumably site2Link is a Map, so get() yields null for an unknown name and the
+ // URL below would end in "null" — confirm callers only pass names that are keys of site2Link.
+ String domain2 = GutenbergConstants.site2Link.get(domain);
+ //String site = IGutenbergConstants.BASE_URL_DOMAIN + domain2 + "_(Bookshelf)";
+ String site = IGutenbergConstants.BASE_URL_DOMAIN + domain2;
+ //System.out.println("domain2-----" + domain2);
+ System.out.println("site---------" + site);
+ Document d = Jsoup.connect(site).timeout(60*1000).get();
+ Elements certainlinks = d.select("a[href*=www.gutenberg.org/ebooks/]");
+ for (Element table : certainlinks){
+ Element a = table.select("a").first();
+ String linkStr = a.attr("href");
+ //System.out.println(linkStr);
+ int lastIndex = linkStr.lastIndexOf('/');
+ String s2 = linkStr.substring(lastIndex+1);
+ // Keep only hrefs whose last path segment is a number; it is used as the ebook id.
+ if (s2.matches("[-+]?\\d*\\.?\\d+"))
+ {
+ temp.add(s2);
+ }
+ //System.out.println(s2);
+
+ }
+ // Never try to download more books than ids were found on the page.
+ if(limit>temp.size())
+ {
+ limit = temp.size();
+ }
+ //System.out.println("******************************************************");
+ //System.out.println(temp);
+ //System.out.println("******************************************************");
+ // NOTE(review): an object is opened right after the array start; the matching
+ // writeEndObject() is not visible in this copy.
+ jsonGenerator.writeStartObject();
+ //System.out.println("Size of temp-------" + temp.size());
+ int tempSize = temp.size();
+ monitor.worked(1);
+
+
+ // I/O failures during setup are only printed; execution continues with the loop below.
+ } catch (IOException e2) {
+ e2.printStackTrace();
+ }
+
+ while(true){
+ if(limit==downloadedCount)
+ break;
+ try{
+ //System.out.println("I am in while-------------||||||||||||||||||||||||||------------------------");
+
+ // NOTE(review): damaged line — the loop header breaks off after "downloadedCount" and the rest
+ // of the line is a declaration that appears to come from another method.
+ for (int i = downloadCount;downloadedCount temp = new ArrayList();
+ int downloadCount = 0;
+ int downloadedCount =0;
+ this.monitor =monitor;
+ jsonFactory = new JsonFactory();
+ File streamFile = new File(dir+File.separator+ "Latest Search"+".json");
+ try {
+ jsonGenerator = jsonFactory.createGenerator(streamFile, JsonEncoding.UTF8);
+ jsonGenerator.useDefaultPrettyPrinter();
+ jsonGenerator.writeStartArray();
+ } catch (IOException e2) {
+ e2.printStackTrace();
+ }
+ File f=null;
+ try
+ {
+ f = new File(dir+File.separator+ "Latest Search" +".txt");
+ monitor.worked(1);
+ String site = IGutenbergConstants.LATEST_SERACH;
+ System.out.println("site-----" + site);
+ Document d = Jsoup.connect(site).timeout(60*1000).get();
+ Elements certainlinks = d.select("a[href*=/ebooks/]");
+ //System.out.println(certainlinks);
+ for (Element table : certainlinks){
+ Element a = table.select("a").first();
+ String linkStr = a.attr("href");
+ //System.out.println(linkStr);
+ int lastIndex = linkStr.lastIndexOf('/');
+ String s2 = linkStr.substring(lastIndex+1);
+ if (s2.matches("[-+]?\\d*\\.?\\d+"))
+ {
+ temp.add(s2);
+ }
+ }
+ if(limit>temp.size())
+ {
+ limit = temp.size();
+ }
+ //System.out.println("temp ka size=======" + temp.size());
+ //System.out.println("Limit given========" + limit);
+ //System.out.println(temp);
+ }
+ // Only 412 and 404 are reported; any other HTTP status is dropped without output.
+ catch(HttpStatusException e1){
+ if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404)
+ {
+ System.out.println("Error Status Code is ----" + e1.getStatusCode());
+ }
+ }
+
+ while(true){
+ if(limit==downloadedCount)
+ break;
+ //System.out.println("******************I am in if***************************");
+ // NOTE(review): damaged line — the loop condition, the try opener and the assignment of
+ // numOfebook are missing in this copy.
+ for(int i = downloadCount;downloadedCount" + numOfebook);
+ String titleSite = IGutenbergConstants.TITLE_BASE_URL + numOfebook + "/" + numOfebook + "-h/" + numOfebook + "-h.htm";
+ //System.out.println(titleSite);
+ Document e = Jsoup.connect(titleSite).timeout(60*1000).get();
+ Element title = e.select("title").first();
+ //System.out.println(title);
+ String contentSite = IGutenbergConstants.CONTENT_BASE_URL + numOfebook + "/pg" + numOfebook + ".txt";
+ System.out.println("=============>>>>>>" + contentSite);
+ // The content URL is requested twice: get() supplies the text that is written out below,
+ // the execute() response is never read in the visible code.
+ Document g = Jsoup.connect(contentSite).timeout(60*1000).get();
+ Response response = Jsoup.connect(contentSite).execute();
+ ConsoleView.printlInConsoleln("Writing topic: "+ Jsoup.parse(title.toString()).text());
+ jsonGenerator.writeStartObject();
+ jsonGenerator.writeObjectField("title", Jsoup.parse(title.toString()).text());
+ jsonGenerator.writeObjectField("abstract_body", Jsoup.parse(g.toString()).text());
+ jsonGenerator.writeEndObject();
+ downloadCount++;
+ downloadedCount++;
+ // Last id of the current list consumed but limit not reached: fetch ids from the next results page.
+ if(i==temp.size()-1 && downloadedCount!=limit)
+ {
+ //System.out.println("*************************************************");
+ temp = searchNextPage(temp,limit);
+ }
+ //System.out.println("Download Count is=" + downloadCount);
+ //System.out.println("Downloaded Count is=" + downloadedCount);
+ }
+ // A 412/404 for one book skips it: downloadCount still advances, downloadedCount does not.
+ catch(HttpStatusException e1){
+ if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404)
+ {
+ System.out.println("Error Status Code is ----" + e1.getStatusCode());
+ //System.out.println("Continuing after error");
+ downloadCount++;
+ //System.out.println("Download Count in Catch=" + downloadCount);
+ //System.out.println("Downloaded Count in Catch=" + downloadedCount);
+ //System.out.println("Last Book in Catch=" + lastbook);
+ if(i==temp.size()-1 && downloadedCount!=limit)
+ {
+ //System.out.println("Searching nextttttttttttt paggeeeeeeeeeee");
+ temp = searchNextPage(temp,limit);
+ }
+
+ continue;
+ }
+ }
+
+ }
+ jsonGenerator.writeEndArray();
+ }
+
+ //System.out.println("*****************I am out of if******************");
+ //System.out.println("I am out of extreme");
+ // System.out.println(temp);
+ //System.out.println("temp ka size=======" + temp.size());
+ // A failure while closing the output is printed and not rethrown.
+ try {
+ jsonGenerator.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ System.out.println("Number of Downloads--" + downloadedCount);
+
+}
+
+ /**
+ * Fetches the next page of search results and appends every ebook id found on it to
+ * {@code temp}.
+ *
+ * The page is addressed with {@code start_index=nextpageindex}. After a successful fetch both
+ * {@code nextpageindex} and {@code lastbook} are advanced by 25; if the request throws, neither
+ * counter changes.
+ *
+ * NOTE(review): the URL is built from {@code IGutenbergConstants.POPULAR_SEARCH}, while the code
+ * that calls this method reads its first page from {@code IGutenbergConstants.LATEST_SERACH} —
+ * confirm that paging through the popular listing is intended here.
+ *
+ * @param temp ebook ids collected so far; mutated in place
+ * @param limit requested number of downloads; not used by this method, kept for existing callers
+ * @return the same {@code temp} instance with the ids of the next page appended
+ * @throws IOException if the results page cannot be fetched
+ */
+ private ArrayList searchNextPage(ArrayList temp, int limit) throws IOException {
+     String site2 = IGutenbergConstants.POPULAR_SEARCH + "&start_index=" + nextpageindex;
+     Document h = Jsoup.connect(site2).timeout(60*1000).get();
+     for (Element link : h.select("a[href*=/ebooks/]")) {
+         String linkStr = link.select("a").first().attr("href");
+         String s2 = linkStr.substring(linkStr.lastIndexOf('/') + 1);
+         // Keep only hrefs whose last path segment is a number; it is used as the ebook id.
+         if (s2.matches("[-+]?\\d*\\.?\\d+")) {
+             temp.add(s2);
+         }
+     }
+     nextpageindex = nextpageindex + 25;
+     lastbook = lastbook + 25;
+     return temp;
+ }
+}
diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/SearchPopular.java b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/SearchPopular.java
new file mode 100644
index 00000000..5c17db6b
--- /dev/null
+++ b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/SearchPopular.java
@@ -0,0 +1,226 @@
+package edu.usc.cssl.tacit.crawlers.gutenberg.services;
+
+
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+//import org.eclipse.core.runtime.IProgressMonitor;
+//import org.jsoup.HttpStatusException;
+import org.jsoup.Jsoup;
+import org.jsoup.Connection.Response;
+import org.eclipse.core.runtime.IProgressMonitor;
+import org.jsoup.HttpStatusException;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import com.fasterxml.jackson.core.JsonEncoding;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+
+import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
+
+
+public class SearchPopular {
+ JsonFactory jsonFactory;
+ JsonGenerator jsonGenerator;
+ //IProgressMonitor monitor;
+ public int nextpageindex=26;
+ ArrayList temp = new ArrayList();
+ ArrayList checktemp = new ArrayList();
+ IProgressMonitor monitor;
+
+
+ //popular method will go the site and get the books in arraylist called temp
+ public void popular(String dir, int limit,String query,IProgressMonitor monitor) throws IOException {
+ int downloadCount = 0;
+ int downloadedCount =0;
+ this.monitor = monitor;
+ jsonFactory = new JsonFactory();
+ File streamFile = new File(dir+File.separator+ query +".json");
+ try {
+ jsonGenerator = jsonFactory.createGenerator(streamFile, JsonEncoding.UTF8);
+ jsonGenerator.useDefaultPrettyPrinter();
+ jsonGenerator.writeStartArray();
+ } catch (IOException e2) {
+ e2.printStackTrace();
+ }
+ File f=null;
+ try
+ {
+ f = new File(dir+File.separator+ query +".txt");
+ monitor.worked(1);
+ String site = IGutenbergConstants.POPULAR_SEARCH+query;
+ System.out.println("site-----" + site);
+ Document d = Jsoup.connect(site).timeout(60*1000).get();
+ Elements certainlinks = d.select("a[href*=/ebooks/]");
+ //System.out.println(certainlinks);
+ for (Element table : certainlinks){
+ Element a = table.select("a").first();
+ String linkStr = a.attr("href");
+ //System.out.println(linkStr);
+ int lastIndex = linkStr.lastIndexOf('/');
+ String s2 = linkStr.substring(lastIndex+1);
+ if (s2.matches("[-+]?\\d*\\.?\\d+"))
+ {
+ temp.add(s2);
+ }
+ }
+ //System.out.println("temp ka size=======" + temp.size());
+ //System.out.println("Limit given========" + limit);
+ //System.out.println(temp);
+ monitor.worked(1);
+ if(temp.size()==0)
+ {
+ //when search result returns nothing. i.e no books according to user search
+ ConsoleView.printlInConsoleln("No books found according to your search.Kindly change your search options or try again later.");
+ }
+ if((limit==temp.size() || limit < temp.size() || limit>temp.size()) && temp.size()!=0)
+ {
+ //Two of the three cases i told you
+ lessThanOrEqualTo(limit,query,temp);
+ }
+ }
+ catch(HttpStatusException e1){
+ if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404)
+ {
+ System.out.println("Error Status Code is ----" + e1.getStatusCode());
+ }
+ }
+
+ try {
+ jsonGenerator.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ //System.out.println("Number of Downloads-|||||||||||||||||||||||||||||||||||" + downloadCount);
+
+ }
+
+
+ /*Cases :
+ * Case 0 : limit 5, temp 25, can get more in temp, 0 fails of first 25.
+ * Case 1: limit 5, temp 25, can get more in temp, 22 fails of first 25
+ * Case 2: limit 27, temp 25, can get more in temp, 10 fails of first 25
+ * Case 3: limit 10, temp 25, can't get more in temp, 20 fails of first 25.
+ * Case 4: limit 20, temp 15, can't get more in temp, 0 fails of first 25.
+ *
+ */
+ //lessthanOrEqualTo method will now extract title and content from the books in temp
+ /**
+ * Downloads the books whose ids are in {@code temp} and writes one JSON object per book
+ * (fields "title" and "abstract_body") to {@code jsonGenerator}. The outer loop ends when
+ * {@code limit} books have been written or when the next results page yields no ids. Afterwards
+ * the JSON array is ended and the number of downloaded books is printed to the console.
+ *
+ * NOTE(review): the {@code for} header below is damaged in this copy — the loop condition, the
+ * {@code try} opener and the assignment of {@code numOfebook} are missing. Restore the original
+ * text before changing any code in this method.
+ *
+ * @param limit number of books to download
+ * @param query search text, passed on to {@code searchnextpage} when more ids are needed
+ * @param temp ebook ids of the current results page; cleared and refilled in place when the
+ *        next page is loaded
+ * @throws IOException declared by the signature
+ */
+ private void lessThanOrEqualTo(int limit, String query, ArrayList temp) throws IOException {
+ int downloadedCount =0;
+ //int downloadCount =0;
+ while(true){
+ if(downloadedCount==limit)
+ {
+ break;
+ }
+ // NOTE(review): damaged line — everything between the loop index and the trailing
+ // string concatenation is missing in this copy.
+ for(int i = 0;i" + numOfebook);
+ String titleSite = IGutenbergConstants.TITLE_BASE_URL + numOfebook + "/" + numOfebook + "-h/" + numOfebook + "-h.htm";
+ //System.out.println(titleSite);
+ Document e = Jsoup.connect(titleSite).timeout(60*1000).get();
+ Element title = e.select("title").first();
+ //System.out.println(title);
+ String contentSite = IGutenbergConstants.CONTENT_BASE_URL + numOfebook + "/pg" + numOfebook + ".txt";
+ //System.out.println("=============>>>>>>" + contentSite);
+ // The content URL is requested twice: get() supplies the text that is written out below,
+ // the execute() response is never read in the visible code.
+ Document g = Jsoup.connect(contentSite).timeout(60*1000).get();
+ Response response = Jsoup.connect(contentSite).execute();
+ ConsoleView.printlInConsoleln("Writing topic: "+ Jsoup.parse(title.toString()).text());
+ jsonGenerator.writeStartObject();
+ jsonGenerator.writeObjectField("title", Jsoup.parse(title.toString()).text());
+ jsonGenerator.writeObjectField("abstract_body", Jsoup.parse(g.toString()).text());
+ jsonGenerator.writeEndObject();
+ //downloadCount++;
+ downloadedCount++;
+ //System.out.println("Download Count is=" + downloadCount);
+ //System.out.println("Downloaded Count is=" + downloadedCount);
+ }
+ // A 412/404 for one book is reported and the loop moves on without counting the book;
+ // other HTTP statuses fall through this handler without output.
+ catch(HttpStatusException e1){
+ if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404)
+ {
+ System.out.println("Error Status Code is ----" + e1.getStatusCode());
+ //System.out.println("Continuing after error");
+ //downloadCount++;
+ //System.out.println("Download Count in Catch=" + downloadCount);
+ //System.out.println("Downloaded Count in Catch=" + downloadedCount);
+ continue;
+ }
+ }
+ }
+ if(downloadedCount!=limit)
+ {
+ //This is a check when end of temp is reached and also downloadedCount is not equal to limit so we need to search next page
+ //System.out.println("I should search next page now probably");
+ //downloadCount++;
+ checktemp = searchnextpage(query,limit); //searchnextpage will search the nextpage of site and return next ebooks that will be stored in checktemp
+ if(checktemp.isEmpty())
+ {
+ //means next page is empty
+ //System.out.println("Next temp not found");
+ break;
+ }
+ else
+ {
+ // Replace the consumed ids with those of the next page, reusing the same list instance.
+ temp.clear();
+ temp.addAll(checktemp);
+ }
+ }
+
+ }
+ jsonGenerator.writeEndArray();
+ // NOTE(review): when nothing was downloaded both messages are printed — the "No books found"
+ // text followed by "0 book(s) downloaded".
+ if(downloadedCount==0)
+ {
+ ConsoleView.printlInConsoleln("No books found according to your search.Kindly change your search options or try again later.");
+ }
+ ConsoleView.printlInConsoleln(downloadedCount + " book(s) downloaded according to specified search result.");
+ }
+
+
+
+ /**
+ * Reads the results page that starts at {@code nextpageindex} for the given query and returns
+ * the ebook ids linked from it.
+ *
+ * Progress details are printed to standard output. {@code nextpageindex} is advanced by 25 once
+ * the page has been processed; it stays unchanged if the request throws.
+ *
+ * @param query search text appended to {@code IGutenbergConstants.POPULAR_SEARCH}
+ * @param limit requested number of downloads; only used for the printed difference to the
+ *        current size of the {@code temp} field
+ * @return a new list holding the numeric last path segments of all links containing
+ *         {@code /ebooks/}; empty when the page has no such links
+ * @throws IOException if the results page cannot be fetched
+ */
+ private ArrayList searchnextpage(String query, int limit) throws IOException {
+     System.out.println("&&&&&&&&&&&&&&&&&I am indise SEARCHNEXT&&&&&&&&&&&&&&&&&");
+     ArrayList pageIds = new ArrayList();
+     int remaining = limit - temp.size();
+     System.out.println("Difference is----" + remaining);
+     System.out.println("nextpage index is---" + nextpageindex);
+     String pageUrl = IGutenbergConstants.POPULAR_SEARCH + query + "&start_index=" + nextpageindex;
+     System.out.println("site2********************" + pageUrl);
+     Elements ebookAnchors = Jsoup.connect(pageUrl).timeout(60*1000).get().select("a[href*=/ebooks/]");
+     for (int k = 0; k < ebookAnchors.size(); k++) {
+         String href = ebookAnchors.get(k).select("a").first().attr("href");
+         String tail = href.substring(href.lastIndexOf('/') + 1);
+         // Skip links whose last path segment is not a number.
+         if (!tail.matches("[-+]?\\d*\\.?\\d+")) {
+             continue;
+         }
+         pageIds.add(tail);
+     }
+     nextpageindex += 25;
+     return pageIds;
+ }
+
+
+
+
+}
+
diff --git a/edu.usc.cssl.tacit.feature/feature.xml b/edu.usc.cssl.tacit.feature/feature.xml
index 20a09390..3db8a305 100644
--- a/edu.usc.cssl.tacit.feature/feature.xml
+++ b/edu.usc.cssl.tacit.feature/feature.xml
@@ -542,5 +542,20 @@
version="0.0.0"
unpack="false"/>
+
+
+
+
+
diff --git a/edu.usc.cssl.tacit.repository/plugin.xml b/edu.usc.cssl.tacit.repository/plugin.xml
index 4e3c64f7..18f6f5e6 100644
--- a/edu.usc.cssl.tacit.repository/plugin.xml
+++ b/edu.usc.cssl.tacit.repository/plugin.xml
@@ -290,6 +290,15 @@
relative="org.eclipse.ui.editorss"
visible="false">
+
+
+
+
diff --git a/parent/pom.xml b/parent/pom.xml
index 3326afc0..bcc85831 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -54,6 +54,9 @@
../edu.usc.cssl.tacit.crawlers.govtrack.ui
../edu.usc.cssl.tacit.crawlers.govtrack
+
+ ../edu.usc.cssl.tacit.crawlers.gutenberg.ui
+ ../edu.usc.cssl.tacit.crawlers.gutenberg