Skip to content
This repository has been archived by the owner on Oct 22, 2020. It is now read-only.

Commit

Permalink
parsed pdf added to search result
Browse files Browse the repository at this point in the history
  • Loading branch information
fakturk committed Mar 16, 2018
1 parent 6e47e22 commit d9a360c
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 95 deletions.
25 changes: 18 additions & 7 deletions src/main/java/io/academic/service/AcademicSearchService.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import com.google.gson.JsonParser;
import org.elasticsearch.action.search.*;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.Scroll;
import org.elasticsearch.search.SearchHit;
Expand All @@ -16,6 +17,8 @@
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.index.query.QueryBuilders.matchQuery;

Expand All @@ -30,8 +33,11 @@ public AcademicSearchService() {

public String search(String q) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("dc");
criterias.add("content");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("term","dc",q,false));
searchRequest.source(buildSource("term",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
Expand All @@ -47,8 +53,10 @@ public String search(String q) throws IOException {

public String searchBy(String q, String criteria) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add(criteria);
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("match",criteria,q,false));
searchRequest.source(buildSource("match",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
Expand All @@ -66,8 +74,10 @@ public String searchBy(String q, String criteria) throws IOException {

public String getAll() throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("matchAll","","",true));
searchRequest.source(buildSource("matchAll",criterias,"",true));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
Expand Down Expand Up @@ -128,16 +138,16 @@ public String toJson(String nonJsonString){



public SearchSourceBuilder buildSource(String queryType, String criteria, String q, Boolean showAllFields){
public SearchSourceBuilder buildSource(String queryType, ArrayList<String> criteria, String q, Boolean showAllFields){
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();

if (queryType.equals("match"))
{
searchSourceBuilder.query(matchQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.matchQuery(criteria.get(0),q));
}
else if (queryType.equals("term"))
{
searchSourceBuilder.query(QueryBuilders.termQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.multiMatchQuery(q,criteria.toArray(new String[criteria.size()])));
}
else
{
Expand All @@ -147,7 +157,8 @@ else if (queryType.equals("term"))
searchSourceBuilder.sort(new FieldSortBuilder("title.keyword").order(SortOrder.DESC));
if (!showAllFields)
{
String[] includeFields = new String[] {"title",criteria};
criteria.add("title");
String[] includeFields = criteria.toArray(new String[criteria.size()]);
String[] excludeFields = new String[] {""};
searchSourceBuilder.fetchSource(includeFields,excludeFields);
searchSourceBuilder.fetchSource(true);
Expand Down
100 changes: 14 additions & 86 deletions src/main/java/io/academic/service/OaiService.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,67 +90,30 @@ public RestHighLevelClient getRestClient() {
public static final String INDEX = "harvester";
private static final String TYPE = "oai";

// private DocumentService documentService = null;
// private IndexService indexService = null;
// private SearchService searchService = null;
// private IndexRequest request;

// @Autowired
// public OaiService(DocumentService documentService, IndexService indexService, SearchService searchService) {
// this.documentService = documentService;
// this.indexService = indexService;
//// indexService.createIndex();
// this.searchService = searchService;
// }

@Autowired
public OaiService()
{

}

public void elasticSave(Article article) throws IOException {
// System.out.println("inside elasticsave");

// IndexRequest request = new IndexRequest(INDEX, TYPE).setEntity(article);
// System.out.println("before article get Article Identifier");
// System.out.println(article.getArticleIdentifier());
// if (article.getArticleIdentifier() != null) {
// request.setId(String.valueOf(article.getId()));
// System.out.println("inside article getid");
IndexRequest request = new IndexRequest(INDEX,TYPE);
request.setPipeline("academic-pdf");
// System.out.println(new Gson().toJson(article));
request.source(new Gson().toJson(article), XContentType.JSON);
// before using this pipeline we have to register the pipeline in Elasticsearch with the following command
// PUT _ingest/pipeline/academic-pdf
// {
// "description": "parse pdfs and index into ES",
// "processors" :
// [
// { "attachment" : { "field": "pdf" } },
// { "remove" : { "field": "pdf" } }
// ]
// }



request.source(new Gson().toJson(article), XContentType.JSON);
IndexResponse indexResponse = restClient.index(request);

// String index = indexResponse.getIndex();
// String type = indexResponse.getType();
// String id = indexResponse.getId();
// long version = indexResponse.getVersion();
// System.out.println(index+" ,"+type+", "+id+", "+version);
// if (indexResponse.getResult() == DocWriteResponse.Result.CREATED) {
//
// } else if (indexResponse.getResult() == DocWriteResponse.Result.UPDATED) {
//
// }
// ReplicationResponse.ShardInfo shardInfo = indexResponse.getShardInfo();
// if (shardInfo.getTotal() != shardInfo.getSuccessful()) {
//
// }
// if (shardInfo.getFailed() > 0) {
// for (ReplicationResponse.ShardInfo.Failure failure : shardInfo.getFailures()) {
// String reason = failure.reason();
// }
// }




// return documentService.index(request);
}


Expand Down Expand Up @@ -179,11 +142,9 @@ public void saveRecords(List<RecordType> recordTypes) {
oaiRecord.setState(0);
oaiRecords.add(oaiRecord);

//TODO: we have to check all the part names and assign them according to the related name, not by order
String[] parts = parsedDc.getDc().split(";;");
Article article = new Article();
// System.out.println("article create sonrasi article id : "+article.getId());
// System.out.println("article create sonrasi oai id : "+oaiRecord.getId());

article.setTitle(parts[0].split("::")[1]);
article.setAuthors(parts[1].split("::")[1]);
article.setKeywords(parts[2].split("::")[1]);
Expand All @@ -195,23 +156,18 @@ public void saveRecords(List<RecordType> recordTypes) {
{
String downlaodUrl = parts[10].split("::")[1];
article.setRelation(downlaodUrl);
// article.setBase64("not available");
article.setBase64(UrlPdftoBase64(downlaodUrl));
}
else
{
article.setRelation("not available");
article.setBase64("bm90IGF2YWlsYWJsZQ==");
article.setBase64("bm90IGF2YWlsYWJsZQ=="); //this is the string "not available" encoded in Base64
}
article.setDc(parsedDc.getDc());
article.setArticleIdentifier(parseIdentifier(oaiRecord.getIdentifier()));
// article.setArticleIdentifier(oaiRecord.getIdentifier());

// System.out.println("article add oncesi article id : "+article.getId());
articles.add(article);

// System.out.println("elastic save oncesi article id : "+article.getId());
// System.out.println("elastic save oncesi article title : "+article.getTitle());
try {
elasticSave(article);
} catch (IOException e) {
Expand Down Expand Up @@ -247,13 +203,8 @@ public String UrlPdftoBase64(String url) {
try {
oracle = new URL(url);
URLConnection yc = oracle.openConnection();
// BufferedReader in = new BufferedReader(new InputStreamReader(
// yc.getInputStream()));

BufferedInputStream bis = new BufferedInputStream(yc.getInputStream());
// String inputLine;
// while ((inputLine = in.readLine()) != null)
// System.out.println(inputLine);
// in.close();

byte bytes[] = IOUtils.toByteArray(bis);
bis.close();
Expand All @@ -266,31 +217,8 @@ public String UrlPdftoBase64(String url) {
e.printStackTrace();
}




return base64;

// String inputLine;
// while ((inputLine = in.readLine()) != null)
// System.out.println(inputLine);
// in.close();




// BufferedReader in = new BufferedReader(
// new InputStreamReader(oracle.openStream()));
// byte bytes[] = IOUtils.toByteArray(oracle);


// String b64String = Base64.

// String inputLine;
// while ((inputLine = in.readLine()) != null)
// System.out.println(inputLine);
// in.close();

}

private LocalDateTime parseDateTime(String string) {
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/io/academic/service/ProcessorService.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ public void startProcessors() {
oaiDataProviderService.queue(new OaiDataProvider("Acta Medica Anatolia","http://dergipark.gov.tr/api/public/oai/","http://dergipark.gov.tr/download/article-file/","dergipark.ulakbim.gov.tr" ));
// oaiDataProviderService.queue(new OaiDataProvider("http://export.arxiv.org/oai2","https://arxiv.org/pdf/"));

oaiService.delete();

oaiService.delete();

}

Expand Down

0 comments on commit d9a360c

Please sign in to comment.