diff --git a/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java b/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java index 654f24b6..9f1a3fda 100644 --- a/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java +++ b/edu.usc.cssl.tacit.common.ui/src/edu/usc/cssl/tacit/common/ui/corpusmanagement/services/CMDataType.java @@ -1,7 +1,7 @@ package edu.usc.cssl.tacit.common.ui.corpusmanagement.services; public enum CMDataType { - JSON,REDDIT_JSON, TWITTER_JSON, STACKEXCHANGE_JSON, FRONTIER_JSON, TYPEPAD_JSON, CONGRESS_JSON, PLAIN_TEXT, XML, MICROSOFT_WORD, PRESIDENCY_JSON, HANSARD_JSON, IMPORTED_CSV, PLOSONE_JSON, GOVTRACK_JSON, LATIN_JSON; + JSON,REDDIT_JSON, TWITTER_JSON, STACKEXCHANGE_JSON, FRONTIER_JSON, TYPEPAD_JSON, CONGRESS_JSON, PLAIN_TEXT, XML, MICROSOFT_WORD, PRESIDENCY_JSON, HANSARD_JSON, IMPORTED_CSV, PLOSONE_JSON, GOVTRACK_JSON, LATIN_JSON, GUTENBERG_JSON; public static CMDataType get(String dataType) { if(dataType.equals("PLAIN_TEXT")) return CMDataType.PLAIN_TEXT; @@ -20,6 +20,7 @@ public static CMDataType get(String dataType) { else if(dataType.equals("PLOSONE_JSON")) return CMDataType.PLOSONE_JSON; else if(dataType.equals("GOVTRACK_JSON")) return CMDataType.GOVTRACK_JSON; else if(dataType.equals("LATIN_JSON")) return CMDataType.LATIN_JSON; + else if(dataType.equals("GUTENBERG_JSON")) return CMDataType.GUTENBERG_JSON; return null; } } diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/META-INF/MANIFEST.MF b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/META-INF/MANIFEST.MF new file mode 100644 index 00000000..7504df20 --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/META-INF/MANIFEST.MF @@ -0,0 +1,16 @@ +Manifest-Version: 1.0 +Bundle-ManifestVersion: 2 +Bundle-Name: Ui +Bundle-SymbolicName: edu.usc.cssl.tacit.crawlers.gutenberg.ui;singleton:=true +Bundle-Version: 1.0.0.qualifier 
+Bundle-Activator: edu.usc.cssl.tacit.crawlers.gutenberg.ui.Activator +Require-Bundle: org.eclipse.ui, + org.eclipse.core.runtime, + org.eclipse.ui.forms;bundle-version="3.6.200", + edu.usc.cssl.tacit.common.ui;bundle-version="1.0.0", + edu.usc.cssl.tacit.help;bundle-version="1.0.0", + org.eclipse.help;bundle-version="3.6.0", + edu.usc.cssl.tacit.common;bundle-version="1.0.0", + edu.usc.cssl.tacit.crawlers.gutenberg;bundle-version="1.0.0" +Bundle-RequiredExecutionEnvironment: JavaSE-1.7 +Bundle-ActivationPolicy: lazy diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/build.properties b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/build.properties new file mode 100644 index 00000000..e9863e28 --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/build.properties @@ -0,0 +1,5 @@ +source.. = src/ +output.. = bin/ +bin.includes = META-INF/,\ + .,\ + plugin.xml diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/GutenbergCrawlerIcon.png b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/GutenbergCrawlerIcon.png new file mode 100644 index 00000000..23163601 Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/GutenbergCrawlerIcon.png differ diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/file_obj.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/file_obj.gif new file mode 100644 index 00000000..7ccc6a70 Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/file_obj.gif differ diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/fldr_obj.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/fldr_obj.gif new file mode 100644 index 00000000..51e703b1 Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/fldr_obj.gif differ diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/help_contents.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/help_contents.gif new file mode 100644 index 00000000..9d70301d Binary files /dev/null and 
b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/help_contents.gif differ diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/lrun_obj.gif b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/lrun_obj.gif new file mode 100644 index 00000000..57f41022 Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/icons/lrun_obj.gif differ diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/plugin.xml b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/plugin.xml new file mode 100644 index 00000000..1e289eae --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/plugin.xml @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/Activator.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/Activator.java new file mode 100644 index 00000000..fb8b0f05 --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/Activator.java @@ -0,0 +1,50 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.ui; + +import org.eclipse.ui.plugin.AbstractUIPlugin; +import org.osgi.framework.BundleContext; + +/** + * The activator class controls the plug-in life cycle + */ +public class Activator extends AbstractUIPlugin { + + // The plug-in ID + public static final String PLUGIN_ID = "edu.usc.cssl.tacit.crawlers.gutenberg.ui"; //$NON-NLS-1$ + + // The shared instance + private static Activator plugin; + + /** + * The constructor + */ + public Activator() { + } + + /* + * (non-Javadoc) + * @see org.eclipse.ui.plugin.AbstractUIPlugin#start(org.osgi.framework.BundleContext) + */ + public void start(BundleContext context) throws Exception { + super.start(context); + plugin = this; + } + + /* + * (non-Javadoc) + * @see org.eclipse.ui.plugin.AbstractUIPlugin#stop(org.osgi.framework.BundleContext) + */ + public void stop(BundleContext context) throws Exception { + plugin = null; + super.stop(context); 
+ } + + /** + * Returns the shared instance + * + * @return the shared instance + */ + public static Activator getDefault() { + return plugin; + } + +} diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/GutenbergCrawlerView.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/GutenbergCrawlerView.java new file mode 100644 index 00000000..e8aee9ab --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/GutenbergCrawlerView.java @@ -0,0 +1,1162 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.ui; + +import java.awt.Image; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.eclipse.core.runtime.IProgressMonitor; +import org.eclipse.core.runtime.IStatus; +import org.eclipse.core.runtime.Status; +import org.eclipse.core.runtime.jobs.IJobChangeEvent; +import org.eclipse.core.runtime.jobs.Job; +import org.eclipse.core.runtime.jobs.JobChangeAdapter; +import org.eclipse.jface.action.Action; +import org.eclipse.jface.action.IToolBarManager; +import org.eclipse.jface.dialogs.IMessageProvider; +import org.eclipse.jface.layout.GridDataFactory; +import org.eclipse.jface.layout.GridLayoutFactory; +import org.eclipse.jface.resource.ImageDescriptor; +import org.eclipse.jface.viewers.ILabelProvider; +import org.eclipse.jface.viewers.LabelProvider; +import org.eclipse.jface.window.Window; +import org.eclipse.swt.SWT; +import org.eclipse.swt.custom.ScrolledComposite; +import org.eclipse.swt.events.KeyEvent; +import org.eclipse.swt.events.KeyListener; +import org.eclipse.swt.events.SelectionAdapter; +import org.eclipse.swt.events.SelectionEvent; +import 
org.eclipse.swt.events.SelectionListener; +import org.eclipse.swt.layout.GridData; +import org.eclipse.swt.layout.GridLayout; +import org.eclipse.swt.widgets.Button; +import org.eclipse.swt.widgets.Combo; +import org.eclipse.swt.widgets.Composite; +import org.eclipse.swt.widgets.DirectoryDialog; +import org.eclipse.swt.widgets.Display; +import org.eclipse.swt.widgets.Group; +import org.eclipse.swt.widgets.Label; +import org.eclipse.swt.widgets.Table; +import org.eclipse.swt.widgets.TableItem; +import org.eclipse.swt.widgets.Text; +import org.eclipse.swt.widgets.Tree; +import org.eclipse.swt.widgets.TreeItem; +import org.eclipse.ui.PlatformUI; +import org.eclipse.ui.dialogs.ElementListSelectionDialog; +import org.eclipse.ui.dialogs.PreferencesUtil; +import org.eclipse.ui.forms.IFormColors; +import org.eclipse.ui.forms.events.HyperlinkEvent; +import org.eclipse.ui.forms.events.IHyperlinkListener; +import org.eclipse.ui.forms.widgets.FormToolkit; +import org.eclipse.ui.forms.widgets.Hyperlink; +import org.eclipse.ui.forms.widgets.ScrolledForm; +import org.eclipse.ui.forms.widgets.Section; +import org.eclipse.ui.part.ViewPart; + +import edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal.IGutenbergCrawlerViewConstants; +import edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal.GutenbergCrawlerViewImageRegistry; +import edu.usc.cssl.tacit.common.Preprocessor; +import edu.usc.cssl.tacit.common.ui.composite.from.TacitFormComposite; +import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.CorpusClass; +import edu.usc.cssl.tacit.common.ui.internal.TargetLocationsGroup; +import edu.usc.cssl.tacit.common.ui.outputdata.TableLayoutData; +import edu.usc.cssl.tacit.common.ui.views.ConsoleView; +import edu.usc.cssl.tacit.crawlers.gutenberg.services.GutenbergConstants; +import edu.usc.cssl.tacit.crawlers.gutenberg.services.*; +import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.Corpus; +import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.CMDataType; +import 
edu.usc.cssl.tacit.common.ui.corpusmanagement.services.CorpusClass; +import edu.usc.cssl.tacit.common.ui.corpusmanagement.services.ManageCorpora; + + + + +/** + * View part that hosts the Gutenberg crawler form + */ +public class GutenbergCrawlerView extends ViewPart { + public static String ID = "edu.usc.cssl.tacit.crawlers.gutenberg.ui.view1"; + + private ScrolledForm form; + private FormToolkit toolkit; + private TableLayoutData classLayoutData; + + private Composite searchComposite; + private Button searchButton; + private Button MPButton; + private Text keywordSearchText; + + private Button bothButton; + private Button commonsButton; + private Button lordsButton; + private Button domainButton; + + Combo domainList; + private Table subdomainTable; + private Button addSubdomainBtn; + private Button removeSubdomainButton; + + private Button checkPages; + private Text pageText; + private Text corpusNameTxt; + private ElementListSelectionDialog listDialog; + + private List selectedRepresentatives; + + // Output path text field + + private Text outputPath; + + + + private boolean canProceed = false; + + protected Job job; + + private boolean checkType = true; + boolean breakFlag = false; + + private boolean isDomain = false; + private boolean isSearch = false; + private boolean isLatest = false; + + final String[] domains = new String[]{"Animals","Children","Classics","Countries","Crime","Knowledge","Fiction","Fine Arts","General Works","Geography","History","Language and Literature","Law","Music","Periodicals","Psychology and Philosophy","Religion","Science","Social Sciences","Technology","Wars"}; + //final String[] domains = new String[]{"Animals","Children","Countries","Crime","Knowledge"}; + //final String[] domains = new String[]{"Animals","Children","Countries"}; + + public org.eclipse.swt.graphics.Image getTitleImage() { + return GutenbergCrawlerViewImageRegistry.getImageIconFactory().getImage(IGutenbergCrawlerViewConstants.IMAGE_GUTENBERG_OBJ); + } + @Override + public void 
createPartControl(Composite parent) { + // Creates toolkit and form + toolkit = createFormBodySection(parent, "GUTENBERG CRAWLER"); + Section section = toolkit.createSection(form.getBody(), Section.TITLE_BAR | Section.EXPANDED); + GridDataFactory.fillDefaults().grab(true, false).span(3, 1).applyTo(section); + section.setExpanded(true); + + // Create a composite to hold the other widgets + ScrolledComposite sc = new ScrolledComposite(section, SWT.H_SCROLL | SWT.V_SCROLL); + sc.setExpandHorizontal(true); + sc.setExpandVertical(true); + GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(false).applyTo(sc); + + // Creates an empty row to add spacing + TacitFormComposite.createEmptyRow(toolkit, sc); + + // Create a composite that can hold the other widgets + Composite client = toolkit.createComposite(form.getBody()); + GridLayoutFactory.fillDefaults().equalWidth(true).numColumns(1).applyTo(client); + GridDataFactory.fillDefaults().grab(true, false).span(1, 1).applyTo(client); + GridLayout layout = new GridLayout();// Layout creation + layout.numColumns = 2; + + createCrawlInputParameters(toolkit, client); + // Create table layout to hold the input data + /* + classLayoutData = TacitFormComposite.createTableSection(client, toolkit, layout, "Input Details", + "Add Folder(s) or Corpus Classes to include in analysis.", true, false, true, true); + */ + + + // Add run and help button on the toolbar + addButtonsToToolBar(); + form.setImage(GutenbergCrawlerViewImageRegistry.getImageIconFactory().getImage(IGutenbergCrawlerViewConstants.IMAGE_GUTENBERG_OBJ)); + + } + + /** + * Opens a directory selection ("Browse") dialog titled "Open" + * + * @param browseBtn button whose shell is used as the dialog's parent + * @return the value returned by the dialog's open() call + */ + protected String openBrowseDialog(Button browseBtn) { + DirectoryDialog dlg = new DirectoryDialog(browseBtn.getShell(), SWT.OPEN); + dlg.setText("Open"); + String path = dlg.open(); + return path; + } + + + /** + * Checks to ensure read permission of the given location + * + * @param location + * - Directory path + * @return null if the location is readable, otherwise an error message + 
*/ + public String validateInputDirectory(String location) { + File locationFile = new File(location); + if (locationFile.canRead()) { + return null; + } else { + return "Classification Input Path : Permission Denied"; + } + } + + /** + * Checks to ensure write permission of the given location + * + * @param location + * - path + * @return null if the location is writable, otherwise an error message + */ + public String validateOutputDirectory(String location) { + File locationFile = new File(location); + if (locationFile.canWrite()) { + return null; + } else { + return "Output Path : Permission Denied"; + } + } + + /** + * Validation for "Output path" + * + * @param outputText text widget holding the output path + * @param errorMessage + * - error message to be displayed if required + * @return true if the path is non-empty, an existing directory and writable; false otherwise + */ + protected boolean outputPathListener(Text outputText, String errorMessage) { + if (outputText.getText().isEmpty()) { + form.getMessageManager().addMessage("outputPath", errorMessage, null, IMessageProvider.ERROR); + return false; + } + File tempFile = new File(outputText.getText()); + if (!tempFile.exists() || !tempFile.isDirectory()) { + form.getMessageManager().addMessage("outputPath", errorMessage, null, IMessageProvider.ERROR); + return false; + } else { + form.getMessageManager().removeMessage("outputPath"); + String message = validateOutputDirectory(outputText.getText().toString()); + if (null != message) { + form.getMessageManager().addMessage("outputPath", message, null, IMessageProvider.ERROR); + return false; + } + } + return true; + } + + /* Creates the input parameter widgets for the crawler + */ + + private void createCrawlInputParameters(final FormToolkit toolkit, final Composite parent) { + + Section inputParamsSection = toolkit.createSection(parent, Section.TITLE_BAR | Section.EXPANDED | Section.DESCRIPTION); + GridDataFactory.fillDefaults().grab(true, false).span(1, 1).applyTo(inputParamsSection); + GridLayoutFactory.fillDefaults().numColumns(4).applyTo(inputParamsSection); + inputParamsSection.setText("Input Details"); + + ScrolledComposite sc = new 
ScrolledComposite(inputParamsSection, SWT.H_SCROLL | SWT.V_SCROLL | SWT.BORDER); + sc.setExpandHorizontal(true); + sc.setExpandVertical(true); + GridLayoutFactory.fillDefaults().numColumns(2).equalWidth(false).applyTo(sc); + + Composite mainComposite = toolkit.createComposite(inputParamsSection); + sc.setContent(mainComposite); + GridDataFactory.fillDefaults().grab(true, true).applyTo(sc); + GridLayoutFactory.fillDefaults().numColumns(2).equalWidth(false).applyTo(mainComposite); + inputParamsSection.setClient(mainComposite); + + searchComposite = toolkit.createComposite(mainComposite); + GridDataFactory.fillDefaults().grab(true, false).span(4, 0).applyTo(searchComposite); + GridLayoutFactory.fillDefaults().numColumns(1).equalWidth(false).applyTo(searchComposite); + + Group searchGroup = new Group(searchComposite, SWT.NONE); + GridDataFactory.fillDefaults().grab(true, false).span(1, 0).applyTo(searchGroup); + GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(false).applyTo(searchGroup); + searchGroup.setText("Search type:"); + + searchButton = new Button(searchGroup, SWT.RADIO); + searchButton.setText("Popular Search"); + GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(searchButton); + searchButton.setSelection(true); + + MPButton = new Button(searchGroup, SWT.RADIO); + MPButton.setText("Latest Search"); + GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(MPButton); + MPButton.setSelection(false); + + domainButton = new Button(searchGroup, SWT.RADIO); + domainButton.setText("Domain and Sub Domain Search"); + GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(domainButton); + domainButton.setSelection(false); + + /* + //******************************** + Group tryComposite = new Group(mainComposite, SWT.NONE); + GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(true).applyTo(tryComposite); + GridDataFactory.fillDefaults().grab(true, false).span(4, 
0).indent(0,20).applyTo(tryComposite); + tryComposite.setText("Keyword search:"); + + searchButton = new Button(tryComposite, SWT.RADIO); + searchButton.setText("Popular Search"); + GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(searchButton); + searchButton.setSelection(true); + + MPButton = new Button(tryComposite, SWT.RADIO); + MPButton.setText("Latest Search"); + GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(MPButton); + MPButton.setSelection(false); + + domainButton = new Button(tryComposite, SWT.RADIO); + domainButton.setText("Domain and Sub Domain Search"); + GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(domainButton); + domainButton.setSelection(false); + //******************************** + * + */ + + Group searchFilterComposite = new Group(mainComposite, SWT.NONE); + GridLayoutFactory.fillDefaults().numColumns(3).equalWidth(true).applyTo(searchFilterComposite); + GridDataFactory.fillDefaults().grab(true, false).span(4, 0).indent(0,20).applyTo(searchFilterComposite); + searchFilterComposite.setText("Keyword search:"); + + final Label searchLabel = new Label(searchFilterComposite, SWT.NONE); + searchLabel.setText("Keyword:"); + GridDataFactory.fillDefaults().grab(false, false).indent(4,10).span(1, 0).applyTo(searchLabel); + + keywordSearchText = new Text(searchFilterComposite, SWT.BORDER); + GridDataFactory.fillDefaults().grab(true, false).indent(0,10).span(2, 0).applyTo(keywordSearchText); + keywordSearchText.setMessage("Enter a search term"); + + final Group domainFilterComposite = new Group(mainComposite, SWT.SHADOW_IN); + GridDataFactory.fillDefaults().grab(true, false).span(4, 0).applyTo(domainFilterComposite); + domainFilterComposite.setText("Select Domain and Sub Domain:"); + GridLayoutFactory.fillDefaults().numColumns(3).applyTo(domainFilterComposite); + + Label domain = new Label(domainFilterComposite, SWT.NONE); + domain.setText("Select 
Domain:"); + GridDataFactory.fillDefaults().grab(false, false).span(1, 0).applyTo(domain); + + domainList = new Combo(domainFilterComposite, SWT.FLAT | SWT.READ_ONLY); + GridDataFactory.fillDefaults().grab(true, false).span(2, 0).applyTo(domainList); + toolkit.adapt(domainList); + domainList.setItems(domains); + domainList.select(0); + domainList.setEnabled(false); + + Label sortType = new Label(domainFilterComposite, SWT.NONE); + sortType.setText("Select sub-domains:"); + subdomainTable = new Table(domainFilterComposite, SWT.BORDER | SWT.MULTI); + GridDataFactory.fillDefaults().grab(true, true).span(1, 3).hint(90, 50).applyTo(subdomainTable); + subdomainTable.setEnabled(false); + + Composite buttonComp = new Composite(domainFilterComposite, SWT.NONE); + GridLayout btnLayout = new GridLayout(); + btnLayout.marginWidth = btnLayout.marginHeight = 0; + btnLayout.makeColumnsEqualWidth = false; + buttonComp.setLayout(btnLayout); + buttonComp.setLayoutData(new GridData(GridData.FILL_VERTICAL)); + + addSubdomainBtn = new Button(buttonComp, SWT.PUSH); // $NON-NLS-1$ + addSubdomainBtn.setText("Add..."); + GridDataFactory.fillDefaults().grab(false, false).span(1, 1).applyTo(addSubdomainBtn); + addSubdomainBtn.setEnabled(false); + + addSubdomainBtn.addSelectionListener(new SelectionAdapter() { + @Override + public void widgetSelected(SelectionEvent e) { + + ILabelProvider lp = new ArrayLabelProvider(); + listDialog = new ElementListSelectionDialog(addSubdomainBtn.getShell(), lp); + listDialog.setTitle("Select domain"); + listDialog.setMessage("Type the name of the domain"); + listDialog.setMultipleSelection(true); + listDialog.setElements(GutenbergConstants.sites.get(domainList.getSelectionIndex())); + if (listDialog.open() == Window.OK) { + updateTable(listDialog.getResult()); + } + } + + }); + + removeSubdomainButton = new Button(buttonComp, SWT.PUSH); + removeSubdomainButton.setText("Remove..."); + GridDataFactory.fillDefaults().grab(false, false).span(1, 
1).applyTo(removeSubdomainButton); + removeSubdomainButton.setEnabled(false); + + removeSubdomainButton.addSelectionListener(new SelectionAdapter() { + @Override + public void widgetSelected(SelectionEvent e) { + for (TableItem item : subdomainTable.getSelection()) { + selectedRepresentatives.remove(item.getText()); + item.dispose(); + } + if (selectedRepresentatives.size() == 0) { + removeSubdomainButton.setEnabled(false); + } + } + }); + + Group limitGroup = new Group(mainComposite, SWT.SHADOW_IN); + GridDataFactory.fillDefaults().grab(true, false).span(4, 0).applyTo(limitGroup); + limitGroup.setText("Filter Results"); + GridLayoutFactory.fillDefaults().numColumns(3).applyTo(limitGroup); + + final Composite limitClient = new Composite(limitGroup, SWT.None); + GridDataFactory.fillDefaults().grab(true, false).span(1, 1).indent(10, 10).applyTo(limitClient); + GridLayoutFactory.fillDefaults().numColumns(2).equalWidth(false).applyTo(limitClient); + + checkPages = new Button(limitClient, SWT.CHECK); + GridDataFactory.fillDefaults().grab(true, false).span(2, 0).applyTo(checkPages); + checkPages.setText("Limit Pages"); + + Label limitPages = new Label(limitClient, SWT.NONE); + limitPages.setText("Limit records per sub-domains:"); + GridDataFactory.fillDefaults().grab(false, false).span(1, 0).applyTo(limitPages); + pageText = new Text(limitClient, SWT.BORDER); + GridDataFactory.fillDefaults().grab(true, false).span(1, 0).applyTo(pageText); + pageText.setEnabled(false); + + checkPages.addSelectionListener(new SelectionListener() { + @Override + public void widgetSelected(SelectionEvent e) { + if(checkPages.getSelection()) + pageText.setEnabled(true); + else + pageText.setEnabled(false); + } + + @Override + public void widgetDefaultSelected(SelectionEvent e) { + // TODO Auto-generated method stub + + } + }); + + searchButton.addSelectionListener(new SelectionListener() { + @Override + public void widgetSelected(SelectionEvent e) { + if(searchButton.getSelection()) + { + 
keywordSearchText.setEnabled(true); + domainList.setEnabled(false); + subdomainTable.setEnabled(false); + + } + else + { + keywordSearchText.setEnabled(false); + } + } + + @Override + public void widgetDefaultSelected(SelectionEvent e) { + // TODO Auto-generated method stub + + } + }); + + MPButton.addSelectionListener(new SelectionListener() { + @Override + public void widgetSelected(SelectionEvent e) { + if(MPButton.getSelection()) + { + keywordSearchText.setEnabled(false); + domainList.setEnabled(false); + subdomainTable.setEnabled(false); + + } + } + + @Override + public void widgetDefaultSelected(SelectionEvent e) { + // TODO Auto-generated method stub + + } + }); + + domainButton.addSelectionListener(new SelectionListener() { + @Override + public void widgetSelected(SelectionEvent e) { + if(domainButton.getSelection()) + { + domainList.setEnabled(true); + subdomainTable.setEnabled(true); + addSubdomainBtn.setEnabled(true); + removeSubdomainButton.setEnabled(true); + } + } + + @Override + public void widgetDefaultSelected(SelectionEvent e) { + // TODO Auto-generated method stub + + } + }); + + + TacitFormComposite.createEmptyRow(toolkit, limitGroup); + + Composite client = toolkit.createComposite(form.getBody()); + GridLayoutFactory.fillDefaults().equalWidth(true).numColumns(1).applyTo(client); // Align + // the + // composite + // section + // to + // one + // column + GridDataFactory.fillDefaults().grab(true, false).span(1, 1).applyTo(client); + + TacitFormComposite.createEmptyRow(toolkit, client); + corpusNameTxt = TacitFormComposite.createCorpusSection(toolkit, client, form.getMessageManager()); + TacitFormComposite.createEmptyRow(toolkit, client); + Button btnRun = TacitFormComposite.createRunButton(client, toolkit); + + + btnRun.addSelectionListener(new SelectionListener() { + + @Override + public void widgetSelected(SelectionEvent e) { + final Job job = new Job("Gutenberg Crawler") { + String outputDir; + String corpusName; + Corpus corpus; + int pages; 
+ boolean canProceed; + String query; + @Override + protected IStatus run(IProgressMonitor monitor) { + + TacitFormComposite.setConsoleViewInFocus(); + TacitFormComposite.updateStatusMessage(getViewSite(), null, null, form); + Display.getDefault().syncExec(new Runnable() { + @Override + public void run() { + if(checkPages.getSelection()) + pages = Integer.parseInt(pageText.getText()); + else + pages =-1; + corpusName = corpusNameTxt.getText(); + isDomain = domainButton.getSelection(); + isSearch = searchButton.getSelection(); + isLatest = MPButton.getSelection(); + query = keywordSearchText.getText(); + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator+ corpusName.trim(); + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + } + }); + + int progressSize = 30; + monitor.beginTask("Running Gutenberg Crawler...", progressSize); + TacitFormComposite.writeConsoleHeaderBegining("Gutenberg Crawler started"); + GutenbergMain objmain = new GutenbergMain(); + SearchLatest objlatest = new SearchLatest(); + SearchPopular objpopular = new SearchPopular(); + monitor.subTask("Initializing..."); + monitor.worked(10); + if (monitor.isCanceled()) + handledCancelRequest("Crawling is Stopped"); + corpus = new Corpus(corpusName, CMDataType.GUTENBERG_JSON); + if(isDomain){ + for (final String domain : selectedRepresentatives) { + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName; + outputDir += File.separator + domain; + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + + try { + monitor.subTask("Crawling..."); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + objmain.crawl(outputDir, domain, pages, monitor); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + } catch (Exception e) { + return handleException(monitor, e, "Crawling failed. 
Provide valid data"); + } + try { + Display.getDefault().syncExec(new Runnable() { + + @Override + public void run() { + + CorpusClass cc = new CorpusClass(domain, outputDir); + cc.setParent(corpus); + corpus.addClass(cc); + + } + }); + } catch (Exception e) { + e.printStackTrace(); + return Status.CANCEL_STATUS; + } + } + } + if(isSearch) + { + System.out.println("I am inside search button"); + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName; + outputDir += File.separator + query; + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + + try { + monitor.subTask("Crawling..."); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + objpopular.popular(outputDir,pages,query, monitor); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + } catch (Exception e) { + return handleException(monitor, e, "Crawling failed. Provide valid data"); + } + try { + Display.getDefault().syncExec(new Runnable() { + + @Override + public void run() { + + CorpusClass cc = new CorpusClass(query, outputDir); + cc.setParent(corpus); + corpus.addClass(cc); + + } + }); + } catch (Exception e) { + e.printStackTrace(); + return Status.CANCEL_STATUS; + } + + + } + if(isLatest) + { + + System.out.println("I am inside latest button"); + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName; + outputDir += File.separator + "latest"; + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + + try { + monitor.subTask("Crawling..."); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + objlatest.latest(outputDir,pages, monitor); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + } catch (Exception e) { + return handleException(monitor, e, "Crawling failed. 
Provide valid data"); + } + try { + Display.getDefault().syncExec(new Runnable() { + + @Override + public void run() { + + CorpusClass cc = new CorpusClass("latest", outputDir); + cc.setParent(corpus); + corpus.addClass(cc); + + } + }); + } catch (Exception e) { + e.printStackTrace(); + return Status.CANCEL_STATUS; + } + + + + } + ManageCorpora.saveCorpus(corpus); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + ConsoleView.printlInConsoleln("Created Corpus: "+corpusName); + monitor.worked(100); + monitor.done(); + return Status.OK_STATUS; + + } + }; + job.setUser(true); + boolean canProceed = canItProceed(); + if (canProceed) { + job.schedule(); // schedule the job + job.addJobChangeListener(new JobChangeAdapter() { + + public void done(IJobChangeEvent event) { + if (!event.getResult().isOK()) { + TacitFormComposite + .writeConsoleHeaderBegining("Error: Gutenberg Crawler "); + TacitFormComposite.updateStatusMessage(getViewSite(), "Crawling is stopped", + IStatus.INFO, form); + + } else { + TacitFormComposite.updateStatusMessage(getViewSite(), + "Gutenberg Crawler completed", IStatus.OK, form); + ConsoleView.printlInConsoleln("Gutenberg Crawler completed successfully."); + TacitFormComposite + .writeConsoleHeaderBegining("Success: Gutenberg Crawler "); + + } + } + }); + } + + } + + @Override + public void widgetDefaultSelected(SelectionEvent e) { + // TODO Auto-generated method stub + + } + }); + } + + static class ArrayLabelProvider extends LabelProvider { + @Override + public String getText(Object element) { + return (String) element; + } + } + + public void updateTable(Object[] result) { + if (selectedRepresentatives == null) { + selectedRepresentatives = new ArrayList(); + } + + for (Object object : result) { + if (!selectedRepresentatives.contains((String) object)) + selectedRepresentatives.add((String) object); + } + + subdomainTable.removeAll(); + for (String itemName : selectedRepresentatives) { + TableItem item = new 
TableItem(subdomainTable, 0); + item.setText(itemName); + if (!removeSubdomainButton.isEnabled()) { + removeSubdomainButton.setEnabled(true); + } + } + + } + + + /** + * Adds "Classify" and "Help" buttons on the Naive Bayes Classifier form + */ + private void addButtonsToToolBar() { + IToolBarManager mgr = form.getToolBarManager(); + mgr.add(new Action() { + @Override + public ImageDescriptor getImageDescriptor() { + return (GutenbergCrawlerViewImageRegistry.getImageIconFactory().getImageDescriptor(IGutenbergCrawlerViewConstants.IMAGE_LRUN_OBJ)); + } + + @Override + public String getToolTipText() { + return "Crawl"; + } + + String outputDir; + String corpusName; + Corpus corpus; + int pages; + boolean canProceed; + String query; + + @Override + public void run() { + TacitFormComposite.writeConsoleHeaderBegining("Gutenberg Crawler started"); + TacitFormComposite.updateStatusMessage(getViewSite(), null, null, form); + job = new Job("Gutenberg Crawler") { + @Override + protected IStatus run(final IProgressMonitor monitor) { + TacitFormComposite.setConsoleViewInFocus(); + TacitFormComposite.updateStatusMessage(getViewSite(), null, null, form); + monitor.beginTask("Running Gutenberg Crawler...", 100); + Date dateObj = new Date(); + Display.getDefault().syncExec(new Runnable() { + @Override + public void run() { + if(checkPages.getSelection()) + pages = Integer.parseInt(pageText.getText()); + else + pages =-1; + corpusName = corpusNameTxt.getText(); + isDomain = domainButton.getSelection(); + isSearch = searchButton.getSelection(); + isLatest = MPButton.getSelection(); + query = keywordSearchText.getText(); + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator+ corpusName.trim(); + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + } + }); + //int progressSize = 0; + //if(domainButton.getSelection()) + //{ + //progressSize =selectedRepresentatives.size()*pages + 30; + //} + int progressSize = 30; + 
monitor.beginTask("Running Gutenberg Crawler...", progressSize); + TacitFormComposite.writeConsoleHeaderBegining("Gutenberg Crawler started"); + GutenbergMain objmain = new GutenbergMain(); + SearchLatest objlatest = new SearchLatest(); + SearchPopular objpopular = new SearchPopular(); + monitor.subTask("Initializing..."); + monitor.worked(10); + // Abort before any crawling starts when the user has already cancelled; + // the status must be returned, otherwise the job keeps running. + if (monitor.isCanceled()) + { + return handledCancelRequest("Crawling is Stopped"); + } + corpus = new Corpus(corpusName, CMDataType.GUTENBERG_JSON); + System.out.println("Name of corpus=============" + corpus); + if(isDomain) + { + System.out.println("I am inside domain button"); + for (final String domain : selectedRepresentatives) { + System.out.println("Selected Representatives&&&&&&&&&&&&&&&&=" + selectedRepresentatives); + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName; + outputDir += File.separator + domain; + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + + try { + monitor.subTask("Crawling..."); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + objmain.crawl(outputDir, domain, pages, monitor); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + } catch (Exception e) { + return handleException(monitor, e, "Crawling failed. 
Provide valid data"); + } + try { + Display.getDefault().syncExec(new Runnable() { + + @Override + public void run() { + + CorpusClass cc = new CorpusClass(domain, outputDir); + cc.setParent(corpus); + corpus.addClass(cc); + + } + }); + } catch (Exception e) { + e.printStackTrace(); + return Status.CANCEL_STATUS; + } + } + } + if(isSearch) + { + System.out.println("I am inside search button"); + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName; + outputDir += File.separator + query; + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + + try { + monitor.subTask("Crawling..."); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + objpopular.popular(outputDir,pages,query, monitor); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + } catch (Exception e) { + return handleException(monitor, e, "Crawling failed. Provide valid data"); + } + try { + Display.getDefault().syncExec(new Runnable() { + + @Override + public void run() { + + CorpusClass cc = new CorpusClass(query, outputDir); + cc.setParent(corpus); + corpus.addClass(cc); + + } + }); + } catch (Exception e) { + e.printStackTrace(); + return Status.CANCEL_STATUS; + } + + + } + if(isLatest) + { + + System.out.println("I am inside latest button"); + outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName; + outputDir += File.separator + "latest"; + if (!new File(outputDir).exists()) { + new File(outputDir).mkdirs(); + } + + try { + monitor.subTask("Crawling..."); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + objlatest.latest(outputDir,pages, monitor); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + } catch (Exception e) { + return handleException(monitor, e, "Crawling failed. 
Provide valid data"); + } + try { + Display.getDefault().syncExec(new Runnable() { + + @Override + public void run() { + + CorpusClass cc = new CorpusClass("latest", outputDir); + cc.setParent(corpus); + corpus.addClass(cc); + + } + }); + } catch (Exception e) { + e.printStackTrace(); + return Status.CANCEL_STATUS; + } + + + + } + ManageCorpora.saveCorpus(corpus); + if (monitor.isCanceled()) + return handledCancelRequest("Crawling is Stopped"); + ConsoleView.printlInConsoleln("Created Corpus: "+corpusName); + monitor.worked(100); + monitor.done(); + return Status.OK_STATUS; + } + }; + job.setUser(true); + canProceed = canItProceed(); + if (canProceed) { + job.schedule(); // schedule the job + job.addJobChangeListener(new JobChangeAdapter() { + public void done(IJobChangeEvent event) { + if (!event.getResult().isOK()) { + TacitFormComposite + .writeConsoleHeaderBegining("Error: Gutenberg Crawler"); + } else { + TacitFormComposite.updateStatusMessage(getViewSite(), + "Gutenberg Crawler completed", IStatus.OK, form); + ConsoleView.printlInConsoleln("Gutenberg Crawler completed successfully."); + TacitFormComposite + .writeConsoleHeaderBegining("Success: Gutenberg Crawler "); + + } + } + }); + } + }; + + + + + }); + + Action helpAction = new Action() { + @Override + public ImageDescriptor getImageDescriptor() { + return (GutenbergCrawlerViewImageRegistry.getImageIconFactory().getImageDescriptor(IGutenbergCrawlerViewConstants.IMAGE_HELP_CO)); + } + + @Override + public String getToolTipText() { + return "Help"; + } + + @Override + public void run() { + PlatformUI.getWorkbench().getHelpSystem() + .displayHelp("edu.usc.cssl.tacit.classify.naivebayes.ui.naivebayes"); + }; + }; + mgr.add(helpAction); + PlatformUI.getWorkbench().getHelpSystem().setHelp(helpAction, + "edu.usc.cssl.tacit.classify.naivebayes.ui.naivebayes"); + PlatformUI.getWorkbench().getHelpSystem().setHelp(form, "edu.usc.cssl.tacit.classify.naivebayes.ui.naivebayes"); + form.getToolBarManager().update(true); 
+ } + + /** + * Handles cancel request by sending appropriate message to UI + * + * @param message + * @return + */ + private IStatus handledCancelRequest(String message) { + TacitFormComposite.updateStatusMessage(getViewSite(), message, IStatus.ERROR, form); + ConsoleView.printlInConsoleln("Gutenberg Crawler cancelled."); + return Status.CANCEL_STATUS; + + } + + /** + * Validates the input form to ensure correctness + * + * @param classPaths + * @return + */ + private boolean canItProceed() { + + form.getMessageManager().removeAllMessages(); + Boolean isDomaincheck; + Boolean isSearchcheck; + + isDomaincheck = domainButton.getSelection(); + System.out.println("value of ------------------------" + isDomaincheck); + if(isDomaincheck) + { + System.out.println("I am inside"); + try{ + if(selectedRepresentatives.isEmpty()){ + form.getMessageManager().addMessage("DomainError", "Enter atleast one sub domain name", null, + IMessageProvider.ERROR); + return false; + }else{ + form.getMessageManager().removeMessage("DomainError"); + } + }catch(Exception e){ + form.getMessageManager().addMessage("DomainError", "Enter atleast one sub domain name", null, + IMessageProvider.ERROR); + return false; + } + } + isSearchcheck = searchButton.getSelection(); + if(isSearchcheck) + { + try{ + String query = keywordSearchText.getText(); + if(query == null || query.isEmpty()) + { + form.getMessageManager().addMessage("keyword", "Enter the keyword to be crawled", null, + IMessageProvider.ERROR); + return false; + }else + form.getMessageManager().removeMessage("pageLimit"); + }catch (Exception e) { + form.getMessageManager().addMessage("keyword", "Enter the keyword to be crawled", null, + IMessageProvider.ERROR); + return false; + } + } + try { + int pages = Integer.parseInt(pageText.getText()); + if (pages < 1) { + form.getMessageManager().addMessage("pageLimit", "Enter the number of pages to be crawled", null, + IMessageProvider.ERROR); + return false; + } else + 
form.getMessageManager().removeMessage("pageLimit"); + } catch (Exception e) { + form.getMessageManager().addMessage("pageLimit", "Enter the number of pages to be crawled", null, + IMessageProvider.ERROR); + return false; + } + + // Validate corpus name + String corpusName = corpusNameTxt.getText(); + if (null == corpusName || corpusName.isEmpty()) { + form.getMessageManager().addMessage("corpusName", "Provide corpus name", null, IMessageProvider.ERROR); + return false; + } else { + String outputDir = IGutenbergCrawlerViewConstants.DEFAULT_CORPUS_LOCATION + File.separator + corpusName; + if (new File(outputDir).exists()) { + form.getMessageManager().addMessage("corpusName", "Corpus already exists", null, + IMessageProvider.ERROR); + return false; + } else { + form.getMessageManager().removeMessage("corpusName"); + return true; + } + } + + + } + + /** + * Maps each class to its selected files + * + * @param classLayoutData + * @param classPaths + */ + protected void consolidateSelectedFiles(TableLayoutData classLayoutData, Map> classPaths) { + Tree tree = classLayoutData.getTree(); + for (int i = 0; i < tree.getItemCount(); i++) { + TreeItem temp = tree.getItem(i); + if (temp.getChecked()) { + classPaths.put(temp.getData().toString(), classLayoutData.getSelectedItems(temp)); + } + } + } + + /** + * Function to be called incase of exception + * + * @param monitor + * @param e + * @param message + * @return + */ + private IStatus handleException(IProgressMonitor monitor, Exception e, String message) { + monitor.done(); + System.out.println(message); + e.printStackTrace(); + TacitFormComposite.updateStatusMessage(getViewSite(), message + e.getMessage(), IStatus.ERROR, form); + return Status.CANCEL_STATUS; + } + + @Override + public void setFocus() { + form.setFocus(); + } + + /** + * Output file creation with statistics + * + * @param location + * @param title + * @param dateObj + * @param perf + * @param kValue + * @param monitor + */ + + + /** + * + * @param parent + 
* @param title text set as the title of the created form + * @return the toolkit that was used to create the scrolled form + */ + private FormToolkit createFormBodySection(Composite parent, String title) { + // Every interface requires a toolkit(Display) and form to store the + // components + FormToolkit toolkit = new FormToolkit(parent.getDisplay()); + form = toolkit.createScrolledForm(parent); + toolkit.decorateFormHeading(form.getForm()); + form.setText(title); + GridLayoutFactory.fillDefaults().numColumns(1).equalWidth(true).applyTo(form.getBody()); + return toolkit; + } + +} diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/handlers/GutenbergCrawlerViewHandler.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/handlers/GutenbergCrawlerViewHandler.java new file mode 100644 index 00000000..ce83a31d --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/handlers/GutenbergCrawlerViewHandler.java @@ -0,0 +1,24 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.ui.handlers; + +import org.eclipse.core.commands.AbstractHandler; +import org.eclipse.core.commands.ExecutionEvent; +import org.eclipse.core.commands.ExecutionException; +import org.eclipse.ui.PartInitException; +import org.eclipse.ui.handlers.HandlerUtil; + +import edu.usc.cssl.tacit.crawlers.gutenberg.ui.GutenbergCrawlerView; + +public class GutenbergCrawlerViewHandler extends AbstractHandler{ + + @Override + public Object execute(ExecutionEvent event) throws ExecutionException { + try { + HandlerUtil.getActiveWorkbenchWindowChecked(event). 
+ getActivePage().showView(GutenbergCrawlerView.ID); + + } catch (PartInitException e) { + e.printStackTrace(); + } + return null; + } +} diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/GutenbergCrawlerViewImageRegistry.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/GutenbergCrawlerViewImageRegistry.java new file mode 100644 index 00000000..330eb8ab --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/GutenbergCrawlerViewImageRegistry.java @@ -0,0 +1,46 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal; + +import org.eclipse.jface.resource.ImageDescriptor; +import org.eclipse.jface.resource.ImageRegistry; +import org.eclipse.swt.graphics.Image; + +public class GutenbergCrawlerViewImageRegistry{ + + ImageRegistry ir = new ImageRegistry(); + static GutenbergCrawlerViewImageRegistry imgIcon; + + //Returns the descriptor associated with the given key in this registry, or null if none. 
+ public ImageDescriptor getImageDescriptor(String key) { + return ir.getDescriptor(key); + } + + private GutenbergCrawlerViewImageRegistry(){ + + + ir.put(IGutenbergCrawlerViewConstants .IMAGE_LRUN_OBJ, ImageDescriptor + .createFromFile(GutenbergCrawlerViewImageRegistry.class, "/icons/lrun_obj.gif")); + + ir.put(IGutenbergCrawlerViewConstants .IMAGE_HELP_CO, ImageDescriptor + .createFromFile(GutenbergCrawlerViewImageRegistry.class, "/icons/help_contents.gif")); + + ir.put(IGutenbergCrawlerViewConstants .IMAGE_GUTENBERG_OBJ, ImageDescriptor + .createFromFile(GutenbergCrawlerViewImageRegistry.class, "/icons/GutenbergCrawlerIcon.png")); + } + + public static GutenbergCrawlerViewImageRegistry getImageIconFactory() { + if (imgIcon == null) { + imgIcon = new GutenbergCrawlerViewImageRegistry(); + } + return imgIcon; + + } + + + public Image getImage(String imageName) { + return ir.get(imageName); + } + +} + + +/* This file handles creation of images from the gif files we provide and allocating OS resources for image to get displayed */ \ No newline at end of file diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/IGutenbergCrawlerViewConstants.java b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/IGutenbergCrawlerViewConstants.java new file mode 100644 index 00000000..7c0b5f37 --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg.ui/src/edu/usc/cssl/tacit/crawlers/gutenberg/ui/internal/IGutenbergCrawlerViewConstants.java @@ -0,0 +1,11 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.ui.internal; + +public interface IGutenbergCrawlerViewConstants { + public static final String IMAGE_LRUN_OBJ = "lrun_obj"; + public static final String IMAGE_HELP_CO = "help_co"; + public static final String IMAGE_FILE_OBJ = "File_obj"; + public static final String IMAGE_FOLDER_OBJ = "Foler_obj"; + public static final String IMAGE_GUTENBERG_OBJ = "gutenberg_crawler"; + String 
DEFAULT_CORPUS_LOCATION = System.getProperty("user.dir") + System.getProperty("file.separator") + "json_corpuses" + System.getProperty("file.separator") + "gutenberg"; +} + diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/META-INF/MANIFEST.MF b/edu.usc.cssl.tacit.crawlers.gutenberg/META-INF/MANIFEST.MF new file mode 100644 index 00000000..a1b592b1 --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg/META-INF/MANIFEST.MF @@ -0,0 +1,15 @@ +Manifest-Version: 1.0 +Bundle-ManifestVersion: 2 +Bundle-Name: Gutenberg +Bundle-SymbolicName: edu.usc.cssl.tacit.crawlers.gutenberg +Bundle-Version: 1.0.0.qualifier +Bundle-Activator: edu.usc.cssl.tacit.crawlers.gutenberg.Activator +Require-Bundle: org.eclipse.ui, + org.eclipse.core.runtime, + org.jsoup;bundle-version="1.7.2", + edu.usc.cssl.tacit.common.ui;bundle-version="1.0.0" +Bundle-RequiredExecutionEnvironment: JavaSE-1.6 +Bundle-ActivationPolicy: lazy +Export-Package: edu.usc.cssl.tacit.crawlers.gutenberg.services +Bundle-ClassPath: jackson-core-2.5.0.jar, + . diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/build.properties b/edu.usc.cssl.tacit.crawlers.gutenberg/build.properties new file mode 100644 index 00000000..1ce3283d --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg/build.properties @@ -0,0 +1,5 @@ +source.. = src/ +output.. 
= bin/ +bin.includes = META-INF/,\ + .,\ + jackson-core-2.5.0.jar diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/jackson-core-2.5.0.jar b/edu.usc.cssl.tacit.crawlers.gutenberg/jackson-core-2.5.0.jar new file mode 100644 index 00000000..e8ca122f Binary files /dev/null and b/edu.usc.cssl.tacit.crawlers.gutenberg/jackson-core-2.5.0.jar differ diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/Activator.java b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/Activator.java new file mode 100644 index 00000000..345ade72 --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/Activator.java @@ -0,0 +1,50 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg; + +import org.eclipse.ui.plugin.AbstractUIPlugin; +import org.osgi.framework.BundleContext; + +/** + * The activator class controls the plug-in life cycle + */ +public class Activator extends AbstractUIPlugin { + + // The plug-in ID + public static final String PLUGIN_ID = "edu.usc.cssl.tacit.crawlers.gutenberg"; //$NON-NLS-1$ + + // The shared instance + private static Activator plugin; + + /** + * The constructor + */ + public Activator() { + } + + /* + * (non-Javadoc) + * @see org.eclipse.ui.plugin.AbstractUIPlugin#start(org.osgi.framework.BundleContext) + */ + public void start(BundleContext context) throws Exception { + super.start(context); + plugin = this; + } + + /* + * (non-Javadoc) + * @see org.eclipse.ui.plugin.AbstractUIPlugin#stop(org.osgi.framework.BundleContext) + */ + public void stop(BundleContext context) throws Exception { + plugin = null; + super.stop(context); + } + + /** + * Returns the shared instance + * + * @return the shared instance + */ + public static Activator getDefault() { + return plugin; + } + +} diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergConstants.java 
b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergConstants.java new file mode 100644 index 00000000..faa103ad --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergConstants.java @@ -0,0 +1,538 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.services; + +import java.util.HashMap; + +public class GutenbergConstants { + public static HashMap site2Link = new HashMap(); + public static HashMap sites = new HashMap(); + static String animals[] = new String[]{"Birds", + "Insects", + "Mammals", + "Reptiles and Amphibians", + "Trapping"}; + + static String children[] = new String[]{"Anthologies", + "Biography", + "Book Series", + "Verse", + "Christmas", + "Fiction", + "History", + "Instructional Books", + "Literature", + "Myths and Fairy Tales", + "Religion", + "School Stories" + + }; + + static String classics[] = new String[]{"Classics"}; + + static String Countries[] =new String[]{"Africa", + "Argentina", + "Australia", + "Bulgaria", + "Canada", + "Czech", + "Egypt", + "France", + "Germany", + "Greece", + "India", + "Italy", + "New Zealand", + "Norway", + "South Africa", + "South America", + "Travel", + "United Kingdom", + "United States" + }; + static String Crime[] = new String[]{"Crime Fiction", + "Crime Non Fiction", + "Detective Fiction", + "Mystery Fiction" + }; + + static String Knowledge[] = new String[]{"Education", + "Language Education" + }; + + static String fiction[] = new String[]{"Adventure", + "Children's Fiction", + "Crime Fiction", + "Detective Fiction", + "Erotic Fiction", + "Fantasy", + "General Fiction", + "Gothic Fiction", + "Historical Fiction", + "Horror", + "Humor", + "Movie Books", + "Mystery Fiction", + "Precursors of Science Fiction", + "Romantic Fiction", + "School Stories", + "Science Fiction", + "Western" + }; + + static String fine_arts[] = new String[]{"Architecture", + "Art" + }; + + static String general_works[] = new 
String[]{"Children's Periodicals:Dew Drops", + "Children's Periodicals:The Girls Own Paper", + "Children's Periodicals:Golden Days for Boys and Girls", + "Children's Periodicals:The Great Round World And What Is Going On In It", + "Children's Periodicals:The Nursery", + "Children's Periodicals:St. Nicholas Magazine for Boys and Girls", + "Reference" + }; + + static String geography[] = new String[]{"Anthropology", + "CIA World Factbooks", + "Folklore", + "Maps and Cartography", + "Women's Travel Journals" + }; + + static String history[] = new String[]{"Archaeology", + "Biographies", + "Children's History", + "Classical Antiquity" + }; + + static String language_and_literature[] = new String[]{"Esperanto", + "German Language Books", + "Language Education", + "Plays" + }; + + static String law[] = new String[]{"British Law", + "Canon Law", + "Noteworthy Trials", + "United States Law" + }; + + static String music[] = new String[]{"Music", + "Opera" + }; + + static String periodicals[] = new String[]{"Ainslee's", + "The Aldine", + "The American Architect and Building News", + "The American Journal of Archaeology", + "The American Missionary", + "The American Quarterly Review", + "The Arena", + "The Argosy", + "Armour's Monthly Cook Book", + "Astounding Stories", + "The Atlantic Monthly", + "The Baptist Magazine", + "Barnavännen", + "The Bay State Monthly", + "Bird-Lore", + "Birds, Illustrated by Color Photography", + "Blackwood's Edinburgh Magazine", + "The Botanical Magazine", + "The Brochure Series of Architectural Illustration", + "Buchanan's Journal of Man", + "Bulletin of Lille", //French.Page requires translation + "The Catholic World", + "Celtic Magazine", + "Chambers's Edinburgh Journal", + "The Christian Foundation", + "The Church of England Magazine", + "The Contemporary Review", + "Continental Monthly", + "Current History", + "De Aarde en haar Volken", //Other language.Page requires translation + "Donahoe's Magazine", + "The Economist", + "The Esperantist", 
+ "The Galaxy", + "Garden and Forest", + "Godey's Lady's Book", + "Graham's Magazine", + "Harper's New Monthly Magazine", + "Harper's Young People", + "The Idler", + "The Illustrated War News", + "The International Magazine of Literature, Art, and Science", + "The Irish Ecclesiastical Record", + "The Irish Penny Journal", + "Journal of Entomology and Zoology", + "The Journal of Negro History", + "The Knickerbocker", + "L'Illustration", + "Lippincott's Magazine", + "Little Folks", + "London Medical Gazette", + "The Mayflower", + "McClure's Magazine", + "The Menorah Journal", + "The Mentor", + "The Mirror of Literature, Amusement, and Instruction", + "The Mirror of Taste, and Dramatic Censor", + "Mother Earth", + "Mrs Whittelsey's Magazine for Mothers and Daughters", + "The National Preacher", + "The North American Medical and Surgical Journal", + "Northern Nut Growers Association", + "Notes and Queries", + "Our Young Folks", + "Poetry, A Magazine of Verse", + "Popular Science Monthly", + "Prairie Farmer", + "Punch", + "Punchinello", + "Scientific American", + "The Scrap Book", + "Scribner's Magazine", + "The Speaker", + "The Stars and Stripes", + "The Strand Magazine", + "The Haslemere Museum Gazette", + "The Unpopular Review", + "The Writer", + "The Yellow Book", + + }; + + static String psychology_and_philosophy[] = new String[]{"Bibliomania", + "Philosophy", + "Psychology", + "Witchcraft" + }; + + static String religion[] = new String[]{"Atheism", + "Bahá'í Faith", + "Buddhism", + "Christianity", + "Hinduism", + "Islam", + "Judaism", + "Latter Day Saints", + "Mythology", + "Paganism" + }; + + static String science[] = new String[]{"Astronomy", + "Biology", + "Botany", + "Chemistry", + "Ecology", + "Geology", + "Mathematics", + "Microbiology", + "Microscopy", + "Mycology", + "Natural History", + "Physics", + "Physiology", + "Science", + "Scientific American", + "Zoology" + }; + + static String social_sciences[] = new String[]{"Anarchism", + "Racism", + "Slavery", 
+ "Sociology", + "Suffrage", + "Transportation" + }; + + static String technology[] = new String[]{"Cookery", + "Crafts", + "Engineering", + "Manufacturing", + "Technology", + "Woodwork" + + }; + + static String wars[] = new String[]{"American Revolutionary War", + "Boer War", + "English Civil War", + "Spanish American War", + "US Civil War", + "World War I", + "World War II" + }; + + static{ + sites.put(0, animals); + sites.put(1, children); + sites.put(2,classics); + sites.put(3, Countries); + sites.put(4, Crime); + sites.put(5, Knowledge); + sites.put(6, fiction); + sites.put(7, fine_arts); + sites.put(8, general_works); + sites.put(9, geography); + sites.put(10, history); + sites.put(11, language_and_literature); + sites.put(12, law); + sites.put(13, music); + sites.put(14, periodicals); + sites.put(15, psychology_and_philosophy); + sites.put(16, religion); + sites.put(17, science); + sites.put(18, social_sciences); + sites.put(19, technology); + sites.put(20, wars); + site2Link.put("Birds", "Animals-Wild_(Bookshelf)-Birds"); + site2Link.put("Insects", "Animals-Wild_(Bookshelf)-Insects"); + site2Link.put("Mammals", "Animals-Wild_(Bookshelf)-Mammals"); + site2Link.put("Reptiles and Amphibians", "Animals-Wild_(Bookshelf)-Reptiles_and_Amphibians"); + site2Link.put("Trapping", "Animals-Wild_(Bookshelf)-Trapping"); + + site2Link.put("Anthologies", "Children%27s_Anthologies_(Bookshelf)"); + site2Link.put("Biography", "Children%27s_Biography_(Bookshelf)"); + site2Link.put("Book Series", "Children%27s_Book_Series_(Bookshelf)"); + site2Link.put("Verse", "Children%27s_Verse_(Bookshelf)"); + site2Link.put("Christmas", "Christmas_(Bookshelf)"); + site2Link.put("Fiction", "Children%27s_Fiction_(Bookshelf)"); + site2Link.put("History", "Children%27s_History_(Bookshelf)"); + site2Link.put("Instructional Books", "Children%27s_Instructional_Books_(Bookshelf)"); + site2Link.put("Literature", "Children%27s_Literature_(Bookshelf)"); + site2Link.put("Myths and Fairy Tales", 
"Children%27s_Myths,_Fairy_Tales,_etc._(Bookshelf)"); + site2Link.put("Religion", "Children%27s_Religion_(Bookshelf)"); + site2Link.put("School Stories", "School_Stories_(Bookshelf)"); + + site2Link.put("Classics", "Category:Classics_Bookshelf"); + + site2Link.put("Africa", "Africa_(Bookshelf)"); + site2Link.put("Argentina", "Argentina_(Bookshelf)"); + site2Link.put("Australia", "Australia_(Bookshelf)"); + site2Link.put("Bulgaria","Bulgaria_(Bookshelf)"); + site2Link.put("Canada","Canada_(Bookshelf)"); + site2Link.put("Czech","Czech_(Bookshelf)"); + site2Link.put("Egypt","Egypt_(Bookshelf)"); + site2Link.put("France","France_(Bookshelf)"); + site2Link.put("Germany","Germany_(Bookshelf)"); + site2Link.put("Greece","Greece_(Bookshelf)"); + site2Link.put("India","India_(Bookshelf)"); + site2Link.put("Italy","Italy_(Bookshelf)"); + site2Link.put("New Zealand","New_Zealand"); + site2Link.put("Norway","Norway_(Bookshelf)"); + site2Link.put("South Africa","South_Africa_(Bookshelf)"); + site2Link.put("South America","South_America_(Bookshelf)"); + site2Link.put("Travel","Travel_(Bookshelf)"); + site2Link.put("United Kingdom","United_Kingdom_(Bookshelf)"); + site2Link.put("United States","United_States_(Bookshelf)"); + + site2Link.put("Crime Fiction","Crime_Fiction_(Bookshelf)"); + site2Link.put("Crime Non Fiction","Crime_Nonfiction_(Bookshelf)"); + site2Link.put("Detective Fiction","Detective_Fiction_(Bookshelf)"); + site2Link.put("Mystery Fiction","Mystery_Fiction_(Bookshelf)"); + + site2Link.put("Education","Education"); + site2Link.put("Language Education","Language_Education_(Bookshelf)"); + + site2Link.put("Adventure","Adventure_(Bookshelf)"); + site2Link.put("Children's Fiction","Children%27s_Fiction_(Bookshelf)"); + site2Link.put("Crime Fiction","Crime_Fiction_(Bookshelf)"); + site2Link.put("Detective Fiction","Detective_Fiction_(Bookshelf)"); + site2Link.put("Erotic Fiction","Erotic_Fiction_(Bookshelf)"); + site2Link.put("Fantasy","Fantasy_(Bookshelf)"); + 
site2Link.put("General Fiction","General_Fiction"); + site2Link.put("Gothic Fiction","Gothic_Fiction_(Bookshelf)"); + site2Link.put("Historical Fiction","Historical_Fiction_(Bookshelf)"); + site2Link.put("Horror","Horror_(Bookshelf)"); + site2Link.put("Humor","Humor_(Bookshelf)"); + site2Link.put("Movie Books","Movie_Books_(Bookshelf)"); + site2Link.put("Mystery Fiction","Mystery_Fiction_(Bookshelf)"); + site2Link.put("Precursors of Science Fiction","Precursors_of_Science_Fiction_(Bookshelf)"); + site2Link.put("Romantic Fiction","Romantic_Fiction_(Bookshelf)"); + site2Link.put("School Stories","School_Stories_(Bookshelf)"); + site2Link.put("Science Fiction","Science_Fiction_(Bookshelf)"); + site2Link.put("Western","Western_(Bookshelf)"); + + site2Link.put("Architecture","Architecture_(Bookshelf)"); + site2Link.put("Art","Art_(Bookshelf)"); + + site2Link.put("Children's Periodicals:Dew Drops","Dew_Drops_(Bookshelf)"); + site2Link.put("Children's Periodicals:The Girls Own Paper","The_Girls_Own_Paper_(Bookshelf)"); + site2Link.put("Children's Periodicals:Golden Days for Boys and Girls","Golden_Days_for_Boys_and_Girls_(Bookshelf)"); + site2Link.put("Children's Periodicals:The Great Round World And What Is Going On In It","The_Great_Round_World_And_What_Is_Going_On_In_It_(Bookshelf)"); + site2Link.put("Children's Periodicals:The Nursery","The_Nursery_(Bookshelf)"); + site2Link.put("Children's Periodicals:St. 
Nicholas Magazine for Boys and Girls","St._Nicholas_Magazine_for_Boys_and_Girls_(Bookshelf)"); + site2Link.put("Reference","Reference_(Bookshelf)"); + + site2Link.put("Anthropology","Anthropology_(Bookshelf)"); + site2Link.put("CIA World Factbooks","CIA_World_Factbooks_(Bookshelf)"); + site2Link.put("Folklore","Folklore_(Bookshelf)"); + site2Link.put("Maps and Cartography","Maps_and_Cartography_(Bookshelf)"); + site2Link.put("Women's Travel Journals","Women%27s_Travel_Journals_(Bookshelf)"); + + site2Link.put("Archaeology","Archaeology_(Bookshelf)"); + site2Link.put("Biographies","Biographies_(Bookshelf)"); + site2Link.put("Children's History","Children%27s_History_(Bookshelf)"); + site2Link.put("Classical Antiquity","Classical_Antiquity_(Bookshelf)"); + + site2Link.put("Esperanto","Esperanto_(Bookshelf)"); + site2Link.put("German Language Books","German_Language_Books_(Bookshelf)"); + site2Link.put("Language Education","Language_Education_(Bookshelf)"); + site2Link.put("Plays","Plays_(Bookshelf)"); + + site2Link.put("British Law","British_Law_(Bookshelf)"); + site2Link.put("Canon Law","Canon_Law"); + site2Link.put("Noteworthy Trials","Noteworthy_Trials(Bookshelf)"); + site2Link.put("United States Law","United_States_Law_(Bookshelf)"); + + site2Link.put("Music","Music_(Bookshelf)"); + site2Link.put("Opera","Opera_(Bookshelf)"); + + site2Link.put("Bibliomania","Bibliomania_(Bookshelf)"); + site2Link.put("Philosophy","Philosophy_(Bookshelf)"); + site2Link.put("Psychology","Psychology_(Bookshelf)"); + site2Link.put("Witchcraft","Witchcraft_(Bookshelf)"); + + site2Link.put("Atheism","Atheism_(Bookshelf)"); + site2Link.put("Bahá'í Faith","Bahá%27í_Faith_(Bookshelf)"); + site2Link.put("Buddhism","Buddhism_(Bookshelf)"); + site2Link.put("Christianity","Christianity_(Bookshelf)"); + site2Link.put("Hinduism","Hinduism_(Bookshelf)"); + site2Link.put("Islam","Islam_(Bookshelf)"); + site2Link.put("Judaism","Judaism_(Bookshelf)"); + site2Link.put("Latter Day 
Saints","Latter_Day_Saints_(Bookshelf)"); + site2Link.put("Mythology","Mythology_(Bookshelf)"); + site2Link.put("Paganism","Paganism_(Bookshelf)"); + + site2Link.put("Astronomy","Astronomy_(Bookshelf)"); + site2Link.put("Biology","Biology_(Bookshelf)"); + site2Link.put("Botany","Botany_(Bookshelf)"); + site2Link.put("Chemistry","Chemistry_(Bookshelf)"); + site2Link.put("Ecology","Ecology_(Bookshelf)"); + site2Link.put("Geology","Geology_(Bookshelf)"); + site2Link.put("Mathematics","Mathematics_(Bookshelf)"); + site2Link.put("Microbiology","Microbiology_(Bookshelf)"); + site2Link.put("Microscopy","Microscopy_(Bookshelf)"); + site2Link.put("Mycology","Mycology_(Bookshelf)"); + site2Link.put("Natural History","Natural_History_(Bookshelf)"); + site2Link.put("Physics","Physics_(Bookshelf)"); + site2Link.put("Physiology","Physiology_(Bookshelf)"); + site2Link.put("Science","Science"); + site2Link.put("Scientific American","Scientific_American_(Bookshelf)"); + site2Link.put("Zoology","Zoology_(Bookshelf)"); + + site2Link.put("Anarchism","Anarchism_(Bookshelf)"); + site2Link.put("Racism","Racism_(Bookshelf)"); + site2Link.put("Slavery","Slavery_(Bookshelf)"); + site2Link.put("Sociology","Sociology_(Bookshelf)"); + site2Link.put("Suffrage","Suffrage"); + site2Link.put("Transportation","Transportation_(Bookshelf)"); + + site2Link.put("Energy Research","fenrg"); + site2Link.put("ICT","fict"); + site2Link.put("Materials","fmats"); + site2Link.put("Mechanical Engineering","fmech"); + site2Link.put("Robotics and AI","frobt"); + site2Link.put("Communication","fcomm"); + site2Link.put("Digital Humanities","fdigh"); + site2Link.put("Sociology","fsoc"); + + site2Link.put("Cookery","Cookery_(Bookshelf)"); + site2Link.put("Crafts","Crafts_(Bookshelf)"); + site2Link.put("Engineering","Engineering_(Bookshelf)"); + site2Link.put("Manufacturing","Manufacturing"); + site2Link.put("Technology","Technology_(Bookshelf)"); + site2Link.put("Woodwork","Woodwork"); + + site2Link.put("American 
Revolutionary War","American_Revolutionary_War_(Bookshelf)"); + site2Link.put("Boer War","Boer_War_(Bookshelf)"); + site2Link.put("English Civil War","English_Civil_War_(Bookshelf)"); + site2Link.put("Spanish American War","Spanish_American_War_(Bookshelf)"); + site2Link.put("US Civil War","US_Civil_War_(Bookshelf)"); + site2Link.put("World War I","World_War_I_(Bookshelf)"); + site2Link.put("World War II","World_War_II_(Bookshelf)"); + + site2Link.put("Ainslee's","Ainslee%27s_(Bookshelf)"); + site2Link.put("The Aldine","The_Aldine_(Bookshelf)"); + site2Link.put("The American Architect and Building News","The_American_Architect_and_Building_News_(Bookshelf)"); + site2Link.put("The American Journal of Archaeology","The_American_Journal_of_Archaeology_(Bookshelf)"); + site2Link.put("The American Missionary","The_American_Missionary_(Bookshelf)"); + site2Link.put("The American Quarterly Review","The_American_Quarterly_Review_(Bookshelf)"); + site2Link.put("The Arena","The_Arena_(Bookshelf)"); + site2Link.put("The Argosy","The_Argosy_(Bookshelf)"); + site2Link.put("Armour's Monthly Cook Book","Armour%27s_Monthly_Cook_Book_(Bookshelf)"); + site2Link.put("Astounding Stories","Astounding_Stories_(Bookshelf)"); + site2Link.put("The Atlantic Monthly","The_Atlantic_Monthly_(Bookshelf)"); + site2Link.put("The Baptist Magazine","The_Baptist_Magazine_(Bookshelf)"); + site2Link.put("Barnavännen","Barnavännen_(Bookshelf)"); + site2Link.put("The Bay State Monthly","The_Bay_State_Monthly_(Bookshelf)"); + site2Link.put("Bird-Lore","Bird-Lore_(Bookshelf)"); + site2Link.put("Birds, Illustrated by Color Photography","Birds,_Illustrated_by_Color_Photography_(Bookshelf)"); + site2Link.put("Blackwood's Edinburgh Magazine","Blackwood%27s_Edinburgh_Magazine_(Bookshelf)"); + site2Link.put("The Botanical Magazine","The_Botanical_Magazine_(Bookshelf)"); + site2Link.put("The Brochure Series of Architectural Illustration","The_Brochure_Series_of_Architectural_Illustration_(Bookshelf)"); + 
site2Link.put("Buchanan's Journal of Man","Buchanan%27s_Journal_of_Man_(Bookshelf)"); + site2Link.put("Bulletin of Lille","Bulletin_de_Lille_(Bookshelf)"); + site2Link.put("The Catholic World","The_Catholic_World_(Bookshelf)"); + site2Link.put("Celtic Magazine","Celtic_Magazine_(Bookshelf)"); + site2Link.put("Chambers's Edinburgh Journal","Chambers%27s_Edinburgh_Journal_(Bookshelf)"); + site2Link.put("The Christian Foundation","The_Christian_Foundation_(Bookshelf)"); + site2Link.put("The Church of England Magazine","The_Church_of_England_Magazine_(Bookshelf)"); + site2Link.put("The Contemporary Review","The_Contemporary_Review_(Bookshelf)"); + site2Link.put("Continental Monthly","Continental_Monthly_(Bookshelf)"); + site2Link.put("Current History","Current_History_(Bookshelf)"); + site2Link.put("De Aarde en haar Volken","De_Aarde_en_haar_Volken_(Bookshelf)"); + site2Link.put("Donahoe's Magazine","Donahoe%27s_Magazine_(Bookshelf)"); + site2Link.put("The Economist","The_Economist_(Bookshelf)"); + site2Link.put("The Esperantist","The_Esperantist_(Bookshelf)"); + site2Link.put("The Galaxy","The_Galaxy_(Bookshelf)"); + site2Link.put("Garden and Forest","Garden_and_Forest_(Bookshelf)"); + site2Link.put("Godey's Lady's Book","Godey%27s_Lady%27s_Book_(Bookshelf)"); + site2Link.put("Graham's Magazine","Graham%27s_Magazine_(Bookshelf)"); + site2Link.put("Harper's New Monthly Magazine","Harper%27s_New_Monthly_Magazine_(Bookshelf)"); + site2Link.put("Harper's Young People","Harper%27s_Young_People_(Bookshelf)"); + site2Link.put("The Idler","The_Idler_(Bookshelf)"); + site2Link.put("The Illustrated War News","The_Illustrated_War_News_(Bookshelf)"); + site2Link.put("The International Magazine of Literature, Art, and Science","The_International_Magazine_of_Literature,_Art,_and_Science_(Bookshelf)"); + site2Link.put("The Irish Ecclesiastical Record","The_Irish_Ecclesiastical_Record_(Bookshelf)"); + site2Link.put("The Irish Penny Journal","The_Irish_Penny_Journal_(Bookshelf)"); + 
site2Link.put("Journal of Entomology and Zoology","Journal_of_Entomology_and_Zoology_(Bookshelf)"); + site2Link.put("The Journal of Negro History","The_Journal_of_Negro_History_(Bookshelf)"); + site2Link.put("The Knickerbocker","The_Knickerbocker_(Bookshelf)"); + site2Link.put("L'Illustration","L%27Illustration_(Bookshelf)"); + site2Link.put("Lippincott's Magazine","Lippincott%27s_Magazine_(Bookshelf)"); + site2Link.put("Little Folks","Little_Folks_(Bookshelf)"); + site2Link.put("London Medical Gazette","London_Medical_Gazette"); + site2Link.put("The Mayflower","The_Mayflower_(Bookshelf)"); + site2Link.put("McClure's Magazine","McClure%27s_Magazine_(Bookshelf)"); + site2Link.put("The Menorah Journal","The_Menorah_Journal_(Bookshelf)"); + site2Link.put("The Mentor","The_Mentor_(Bookshelf)"); + site2Link.put("The Mirror of Literature, Amusement, and Instruction","The_Mirror_of_Literature,_Amusement,_and_Instruction_(Bookshelf)"); + site2Link.put("The Mirror of Taste, and Dramatic Censor","The_Mirror_of_Taste,_and_Dramatic_Censor_(Bookshelf)"); + site2Link.put("Mother Earth","Mother_Earth_(Bookshelf)"); + site2Link.put("Mrs Whittelsey's Magazine for Mothers and Daughters","Mrs_Whittelsey%27s_Magazine_for_Mothers_and_Daughters_(Bookshelf)"); + site2Link.put("The National Preacher","The_National_Preacher_(Bookshelf)"); + site2Link.put("The North American Medical and Surgical Journal","The_North_American_Medical_and_Surgical_Journal_(Bookshelf)"); + site2Link.put("Northern Nut Growers Association","Northern_Nut_Growers_Association_(Bookshelf)"); + site2Link.put("Notes and Queries","Notes_and_Queries_(Bookshelf)"); + site2Link.put("Our Young Folks","Our_Young_Folks_(Bookshelf)"); + site2Link.put("Poetry, A Magazine of Verse","Poetry,_A_Magazine_of_Verse_(Bookshelf)"); + site2Link.put("Popular Science Monthly","Popular_Science_Monthly_(Bookshelf)"); + site2Link.put("Prairie Farmer","Prairie_Farmer_(Bookshelf)"); + site2Link.put("Punch","Punch_(Bookshelf)"); + 
site2Link.put("Punchinello","Punchinello_(Bookshelf)"); + site2Link.put("Scientific American","Scientific_American_(Bookshelf)"); + site2Link.put("The Scrap Book","The_Scrap_Book_(Bookshelf)"); + site2Link.put("Scribner's Magazine","Scribner%27s_Magazine_(Bookshelf)"); + site2Link.put("The Speaker","The_Speaker_(Bookshelf)"); + site2Link.put("The Stars and Stripes","The_Stars_and_Stripes_(Bookshelf)"); + site2Link.put("The Strand Magazine","The_Strand_Magazine_(Bookshelf)"); + site2Link.put("The Haslemere Museum Gazette","The_Haslemere_Museum_Gazette_(Bookshelf)"); + site2Link.put("The Unpopular Review","The_Unpopular_Review_(Bookshelf)"); + site2Link.put("The Writer","The_Writer_(Bookshelf)"); + site2Link.put("The Yellow Book","The_Yellow_Book_(Bookshelf)"); + + + } +} diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergMain.java b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergMain.java new file mode 100644 index 00000000..0f6e033b --- /dev/null +++ b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/GutenbergMain.java @@ -0,0 +1,165 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.services; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +import org.eclipse.core.runtime.IProgressMonitor; +import org.jsoup.HttpStatusException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import com.fasterxml.jackson.core.JsonEncoding; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; + +import edu.usc.cssl.tacit.common.ui.views.ConsoleView; + +//import edu.usc.cssl.tacit.crawlers.frontier.services.FrontierConstants; + + +public class GutenbergMain { + JsonFactory jsonFactory; + JsonGenerator jsonGenerator; + IProgressMonitor monitor; + + public void crawl(String dir, 
String domain, int limit,IProgressMonitor monitor) throws IOException{ + ConsoleView.printlInConsoleln("For Sub Domain: " + domain); + //System.out.println("I am in crawl"); + this.monitor = monitor; + //System.out.println("dir------" + dir); + //System.out.println("domain---" + domain); + //System.out.println("limit-----" + limit); + ArrayList temp = new ArrayList(); + int downloadCount = 0; + jsonFactory = new JsonFactory(); + //this.monitor = monitor; + File streamFile = new File(dir+File.separator+domain+".json"); + int count =0; + int downloadedCount =0; + File f=null; + System.out.println("I am in while---and count is ----" + count); + try { + jsonGenerator = jsonFactory.createGenerator(streamFile, JsonEncoding.UTF8); + jsonGenerator.useDefaultPrettyPrinter(); + jsonGenerator.writeStartArray(); + + f = new File(dir+File.separator+domain+".txt"); + //System.out.println("File Name--------" + dir+File.separator+domain+".txt"); + //String domain1= domain.replaceAll("\\s+", "_"); + //String domain2 = domain1.replace("'", "%27"); + String domain2 = GutenbergConstants.site2Link.get(domain); + //String site = IGutenbergConstants.BASE_URL_DOMAIN + domain2 + "_(Bookshelf)"; + String site = IGutenbergConstants.BASE_URL_DOMAIN + domain2; + //System.out.println("domain2-----" + domain2); + System.out.println("site---------" + site); + Document d = Jsoup.connect(site).timeout(60*1000).get(); + Elements certainlinks = d.select("a[href*=www.gutenberg.org/ebooks/]"); + for (Element table : certainlinks){ + Element a = table.select("a").first(); + String linkStr = a.attr("href"); + //System.out.println(linkStr); + int lastIndex = linkStr.lastIndexOf('/'); + String s2 = linkStr.substring(lastIndex+1); + if (s2.matches("[-+]?\\d*\\.?\\d+")) + { + temp.add(s2); + } + //System.out.println(s2); + + } + if(limit>temp.size()) + { + limit = temp.size(); + } + //System.out.println("******************************************************"); + //System.out.println(temp); + 
//System.out.println("******************************************************"); + jsonGenerator.writeStartObject(); + //System.out.println("Size of temp-------" + temp.size()); + int tempSize = temp.size(); + monitor.worked(1); + + + } catch (IOException e2) { + e2.printStackTrace(); + } + + while(true){ + if(limit==downloadedCount) + break; + try{ + //System.out.println("I am in while-------------||||||||||||||||||||||||||------------------------"); + + for (int i = downloadCount;downloadedCount temp = new ArrayList(); + int downloadCount = 0; + int downloadedCount =0; + this.monitor =monitor; + jsonFactory = new JsonFactory(); + File streamFile = new File(dir+File.separator+ "Latest Search"+".json"); + try { + jsonGenerator = jsonFactory.createGenerator(streamFile, JsonEncoding.UTF8); + jsonGenerator.useDefaultPrettyPrinter(); + jsonGenerator.writeStartArray(); + } catch (IOException e2) { + e2.printStackTrace(); + } + File f=null; + try + { + f = new File(dir+File.separator+ "Latest Search" +".txt"); + monitor.worked(1); + String site = IGutenbergConstants.LATEST_SERACH; + System.out.println("site-----" + site); + Document d = Jsoup.connect(site).timeout(60*1000).get(); + Elements certainlinks = d.select("a[href*=/ebooks/]"); + //System.out.println(certainlinks); + for (Element table : certainlinks){ + Element a = table.select("a").first(); + String linkStr = a.attr("href"); + //System.out.println(linkStr); + int lastIndex = linkStr.lastIndexOf('/'); + String s2 = linkStr.substring(lastIndex+1); + if (s2.matches("[-+]?\\d*\\.?\\d+")) + { + temp.add(s2); + } + } + if(limit>temp.size()) + { + limit = temp.size(); + } + //System.out.println("temp ka size=======" + temp.size()); + //System.out.println("Limit given========" + limit); + //System.out.println(temp); + } + catch(HttpStatusException e1){ + if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404) + { + System.out.println("Error Status Code is ----" + e1.getStatusCode()); + } + } + + while(true){ + 
if(limit==downloadedCount) + break; + //System.out.println("******************I am in if***************************"); + for(int i = downloadCount;downloadedCount" + numOfebook); + String titleSite = IGutenbergConstants.TITLE_BASE_URL + numOfebook + "/" + numOfebook + "-h/" + numOfebook + "-h.htm"; + //System.out.println(titleSite); + Document e = Jsoup.connect(titleSite).timeout(60*1000).get(); + Element title = e.select("title").first(); + //System.out.println(title); + String contentSite = IGutenbergConstants.CONTENT_BASE_URL + numOfebook + "/pg" + numOfebook + ".txt"; + System.out.println("=============>>>>>>" + contentSite); + Document g = Jsoup.connect(contentSite).timeout(60*1000).get(); + Response response = Jsoup.connect(contentSite).execute(); + ConsoleView.printlInConsoleln("Writing topic: "+ Jsoup.parse(title.toString()).text()); + jsonGenerator.writeStartObject(); + jsonGenerator.writeObjectField("title", Jsoup.parse(title.toString()).text()); + jsonGenerator.writeObjectField("abstract_body", Jsoup.parse(g.toString()).text()); + jsonGenerator.writeEndObject(); + downloadCount++; + downloadedCount++; + if(i==temp.size()-1 && downloadedCount!=limit) + { + //System.out.println("*************************************************"); + temp = searchNextPage(temp,limit); + } + //System.out.println("Download Count is=" + downloadCount); + //System.out.println("Downloaded Count is=" + downloadedCount); + } + catch(HttpStatusException e1){ + if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404) + { + System.out.println("Error Status Code is ----" + e1.getStatusCode()); + //System.out.println("Continuing after error"); + downloadCount++; + //System.out.println("Download Count in Catch=" + downloadCount); + //System.out.println("Downloaded Count in Catch=" + downloadedCount); + //System.out.println("Last Book in Catch=" + lastbook); + if(i==temp.size()-1 && downloadedCount!=limit) + { + //System.out.println("Searching nextttttttttttt paggeeeeeeeeeee"); + temp 
= searchNextPage(temp,limit); + } + + continue; + } + } + + } + jsonGenerator.writeEndArray(); + } + + //System.out.println("*****************I am out of if******************"); + //System.out.println("I am out of extreme"); + // System.out.println(temp); + //System.out.println("temp ka size=======" + temp.size()); + try { + jsonGenerator.close(); + } catch (IOException e) { + e.printStackTrace(); + } + System.out.println("Number of Downloads--" + downloadedCount); + +} + + private ArrayList searchNextPage(ArrayList temp, int limit) throws IOException { + //System.out.println("&&&&&&&&&&&&&&&&&|||||||||||||||||||||||||||||I am indise while||||||||||||||||||||||||||||||||&&&&&&&&&&&&&&&&&"); + int diff = limit-temp.size(); + //System.out.println("Difference is----" + diff); + //System.out.println("nextpage index is---" + nextpageindex); + String site2 = IGutenbergConstants.POPULAR_SEARCH + "&start_index=" + nextpageindex; + //System.out.println("site2********************" + site2); + Document h = Jsoup.connect(site2).timeout(60*1000).get(); + Elements certainlinks1 = h.select("a[href*=/ebooks/]"); + for (Element table : certainlinks1){ + Element a = table.select("a").first(); + String linkStr = a.attr("href"); + //System.out.println(linkStr); + int lastIndex = linkStr.lastIndexOf('/'); + String s2 = linkStr.substring(lastIndex+1); + if (s2.matches("[-+]?\\d*\\.?\\d+")) + { + temp.add(s2); + } + } + nextpageindex = nextpageindex + 25; + lastbook = lastbook + 25; + //System.out.println("Last Book in method=" + lastbook); + //System.out.println(temp); + //System.out.println("Size of temp----" + temp.size()); + // TODO Auto-generated method stub + return temp; + } +} diff --git a/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/SearchPopular.java b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/SearchPopular.java new file mode 100644 index 00000000..5c17db6b --- /dev/null +++ 
b/edu.usc.cssl.tacit.crawlers.gutenberg/src/edu/usc/cssl/tacit/crawlers/gutenberg/services/SearchPopular.java @@ -0,0 +1,226 @@ +package edu.usc.cssl.tacit.crawlers.gutenberg.services; + + + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +//import org.eclipse.core.runtime.IProgressMonitor; +//import org.jsoup.HttpStatusException; +import org.jsoup.Jsoup; +import org.jsoup.Connection.Response; +import org.eclipse.core.runtime.IProgressMonitor; +import org.jsoup.HttpStatusException; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.fasterxml.jackson.core.JsonEncoding; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; + +import edu.usc.cssl.tacit.common.ui.views.ConsoleView; + + +public class SearchPopular { + JsonFactory jsonFactory; + JsonGenerator jsonGenerator; + //IProgressMonitor monitor; + public int nextpageindex=26; + ArrayList temp = new ArrayList(); + ArrayList checktemp = new ArrayList(); + IProgressMonitor monitor; + + + //popular method will go the site and get the books in arraylist called temp + public void popular(String dir, int limit,String query,IProgressMonitor monitor) throws IOException { + int downloadCount = 0; + int downloadedCount =0; + this.monitor = monitor; + jsonFactory = new JsonFactory(); + File streamFile = new File(dir+File.separator+ query +".json"); + try { + jsonGenerator = jsonFactory.createGenerator(streamFile, JsonEncoding.UTF8); + jsonGenerator.useDefaultPrettyPrinter(); + jsonGenerator.writeStartArray(); + } catch (IOException e2) { + e2.printStackTrace(); + } + File f=null; + try + { + f = new File(dir+File.separator+ query +".txt"); + monitor.worked(1); + String site = IGutenbergConstants.POPULAR_SEARCH+query; + System.out.println("site-----" + site); + Document d = Jsoup.connect(site).timeout(60*1000).get(); + Elements certainlinks = d.select("a[href*=/ebooks/]"); + 
//System.out.println(certainlinks); + for (Element table : certainlinks){ + Element a = table.select("a").first(); + String linkStr = a.attr("href"); + //System.out.println(linkStr); + int lastIndex = linkStr.lastIndexOf('/'); + String s2 = linkStr.substring(lastIndex+1); + if (s2.matches("[-+]?\\d*\\.?\\d+")) + { + temp.add(s2); + } + } + //System.out.println("temp ka size=======" + temp.size()); + //System.out.println("Limit given========" + limit); + //System.out.println(temp); + monitor.worked(1); + if(temp.size()==0) + { + //when search result returns nothing. i.e no books according to user search + ConsoleView.printlInConsoleln("No books found according to your search.Kindly change your search options or try again later."); + } + if((limit==temp.size() || limit < temp.size() || limit>temp.size()) && temp.size()!=0) + { + //Two of the three cases i told you + lessThanOrEqualTo(limit,query,temp); + } + } + catch(HttpStatusException e1){ + if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404) + { + System.out.println("Error Status Code is ----" + e1.getStatusCode()); + } + } + + try { + jsonGenerator.close(); + } catch (IOException e) { + e.printStackTrace(); + } + //System.out.println("Number of Downloads-|||||||||||||||||||||||||||||||||||" + downloadCount); + + } + + + /*Cases : + * Case 0 : limit 5, temp 25, can get more in temp, 0 fails of first 25. + * Case 1: limit 5, temp 25, can get more in temp, 22 fails of first 25 + * Case 2: limit 27, temp 25, can get more in temp, 10 fails of first 25 + * Case 3: limit 10, temp 25, can't get more in temp, 20 fails of first 25. + * Case 4: limit 20, temp 15, can't get more in temp, 0 fails of first 25. 
+ * + */ + //lessthanOrEqualTo method will now extract title and content from the books in temp + private void lessThanOrEqualTo(int limit, String query, ArrayList temp) throws IOException { + int downloadedCount =0; + //int downloadCount =0; + while(true){ + if(downloadedCount==limit) + { + break; + } + for(int i = 0;i" + numOfebook); + String titleSite = IGutenbergConstants.TITLE_BASE_URL + numOfebook + "/" + numOfebook + "-h/" + numOfebook + "-h.htm"; + //System.out.println(titleSite); + Document e = Jsoup.connect(titleSite).timeout(60*1000).get(); + Element title = e.select("title").first(); + //System.out.println(title); + String contentSite = IGutenbergConstants.CONTENT_BASE_URL + numOfebook + "/pg" + numOfebook + ".txt"; + //System.out.println("=============>>>>>>" + contentSite); + Document g = Jsoup.connect(contentSite).timeout(60*1000).get(); + Response response = Jsoup.connect(contentSite).execute(); + ConsoleView.printlInConsoleln("Writing topic: "+ Jsoup.parse(title.toString()).text()); + jsonGenerator.writeStartObject(); + jsonGenerator.writeObjectField("title", Jsoup.parse(title.toString()).text()); + jsonGenerator.writeObjectField("abstract_body", Jsoup.parse(g.toString()).text()); + jsonGenerator.writeEndObject(); + //downloadCount++; + downloadedCount++; + //System.out.println("Download Count is=" + downloadCount); + //System.out.println("Downloaded Count is=" + downloadedCount); + } + catch(HttpStatusException e1){ + if(e1.getStatusCode() == 412 || e1.getStatusCode() == 404) + { + System.out.println("Error Status Code is ----" + e1.getStatusCode()); + //System.out.println("Continuing after error"); + //downloadCount++; + //System.out.println("Download Count in Catch=" + downloadCount); + //System.out.println("Downloaded Count in Catch=" + downloadedCount); + continue; + } + } + } + if(downloadedCount!=limit) + { + //This is a check when end of temp is reached and also downloadedCount is not equal to limit so we need to search next page + 
//System.out.println("I should search next page now probably"); + //downloadCount++; + checktemp = searchnextpage(query,limit); //searchnextpage will search the nextpage of site and return next ebooks that will be stored in checktemp + if(checktemp.isEmpty()) + { + //means next page is empty + //System.out.println("Next temp not found"); + break; + } + else + { + temp.clear(); + temp.addAll(checktemp); + } + } + + } + jsonGenerator.writeEndArray(); + if(downloadedCount==0) + { + ConsoleView.printlInConsoleln("No books found according to your search.Kindly change your search options or try again later."); + } + ConsoleView.printlInConsoleln(downloadedCount + " book(s) downloaded according to specified search result."); + } + + + + private ArrayList searchnextpage(String query, int limit) throws IOException { + System.out.println("&&&&&&&&&&&&&&&&&I am indise SEARCHNEXT&&&&&&&&&&&&&&&&&"); + ArrayList newtemp = new ArrayList(); + int diff = limit-temp.size(); + System.out.println("Difference is----" + diff); + System.out.println("nextpage index is---" + nextpageindex); + String site2 = IGutenbergConstants.POPULAR_SEARCH + query + "&start_index=" + nextpageindex; + System.out.println("site2********************" + site2); + Document h = Jsoup.connect(site2).timeout(60*1000).get(); + Elements certainlinks1 = h.select("a[href*=/ebooks/]"); + for (Element table : certainlinks1){ + Element a = table.select("a").first(); + String linkStr = a.attr("href"); + //System.out.println(linkStr); + int lastIndex = linkStr.lastIndexOf('/'); + String s2 = linkStr.substring(lastIndex+1); + if (s2.matches("[-+]?\\d*\\.?\\d+")) + { + newtemp.add(s2); + } + } + nextpageindex = nextpageindex + 25; + //System.out.println("Lets check new temp"); + //System.out.println("############################################"); + //System.out.println(newtemp); + //System.out.println("############################################"); + return newtemp; + + } + + + + +} + diff --git 
a/edu.usc.cssl.tacit.feature/feature.xml b/edu.usc.cssl.tacit.feature/feature.xml index 20a09390..3db8a305 100644 --- a/edu.usc.cssl.tacit.feature/feature.xml +++ b/edu.usc.cssl.tacit.feature/feature.xml @@ -542,5 +542,20 @@ version="0.0.0" unpack="false"/> + + + + + diff --git a/edu.usc.cssl.tacit.repository/plugin.xml b/edu.usc.cssl.tacit.repository/plugin.xml index 4e3c64f7..18f6f5e6 100644 --- a/edu.usc.cssl.tacit.repository/plugin.xml +++ b/edu.usc.cssl.tacit.repository/plugin.xml @@ -290,6 +290,15 @@ relative="org.eclipse.ui.editorss" visible="false"> + + + + diff --git a/parent/pom.xml b/parent/pom.xml index 3326afc0..bcc85831 100644 --- a/parent/pom.xml +++ b/parent/pom.xml @@ -54,6 +54,9 @@ ../edu.usc.cssl.tacit.crawlers.govtrack.ui ../edu.usc.cssl.tacit.crawlers.govtrack + + ../edu.usc.cssl.tacit.crawlers.gutenberg.ui + ../edu.usc.cssl.tacit.crawlers.gutenberg