This repository has been archived by the owner on Oct 22, 2020. It is now read-only.

Merge pull request #25 from academic/feature/pdf-parser
Feature/pdf parser
Hüseyin Mert authored Mar 17, 2018
2 parents 76c75ee + d9a360c commit fb89840
Showing 6 changed files with 176 additions and 43 deletions.
40 changes: 40 additions & 0 deletions src/main/java/io/academic/entity/Article.java
@@ -11,6 +11,10 @@
@Entity
public class Article extends AbstractAuditingEntity {

public Article(){

}

@Column
@Type(type = "text")
private String title;
@@ -43,6 +47,18 @@ public class Article extends AbstractAuditingEntity {
@Type(type = "text")
private String type;

@Column
@Type(type = "text")
private String base64;

@Column
@Type(type = "text")
private String articleIdentifier;

@Column
@Type(type = "text")
private String relation;

public String getTitle() {
return title;
}
@@ -106,4 +122,28 @@ public String getType() {
public void setType(String type) {
this.type = type;
}

public String getBase64() {
return base64;
}

public void setBase64(String base64) {
this.base64 = base64;
}

public String getArticleIdentifier() {
return articleIdentifier;
}

public void setArticleIdentifier(String articleIdentifier) {
this.articleIdentifier = articleIdentifier;
}

public String getRelation() {
return relation;
}

public void setRelation(String relation) {
this.relation = relation;
}
}
18 changes: 17 additions & 1 deletion src/main/java/io/academic/entity/OaiDataProvider.java
@@ -16,12 +16,18 @@ public OaiDataProvider(){

}

public OaiDataProvider(String name, String url, String identifier) {
public OaiDataProvider(String name, String url,String downloadUrl, String identifier) {
this.name = name;
this.url = url;
this.downloadUrl = downloadUrl;
this.identifier = identifier;
}

public OaiDataProvider(String url,String downloadUrl) {
this.url = url;
this.downloadUrl = downloadUrl;
}

public OaiDataProvider(String url) {
this.url = url;
}
@@ -32,6 +38,9 @@ public OaiDataProvider(String url) {
@Column
private String url;

@Column
private String downloadUrl;

@Column
private String identifier;

@@ -69,5 +78,12 @@ public OaiDataProvider setIdentifier(String identifier) {
return this;
}

public String getDownloadUrl() {
return downloadUrl;
}

public void setDownloadUrl(String downloadUrl) {
this.downloadUrl = downloadUrl;
}

}
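The new downloadUrl column separates the provider's OAI-PMH endpoint from the address used to fetch full-text PDFs. A minimal usage sketch of the new two-argument constructor; both URLs below are hypothetical placeholders, not real provider addresses:

// Hypothetical provider; only the constructor and getter added in this diff are used.
OaiDataProvider provider = new OaiDataProvider(
        "http://example.org/oai/request",        // OAI-PMH endpoint (url)
        "http://example.org/article/download");  // where PDFs are fetched from (downloadUrl)
String pdfBase = provider.getDownloadUrl();      // "http://example.org/article/download"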
25 changes: 18 additions & 7 deletions src/main/java/io/academic/service/AcademicSearchService.java
@@ -6,6 +6,7 @@
import com.google.gson.JsonParser;
import org.elasticsearch.action.search.*;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.Scroll;
import org.elasticsearch.search.SearchHit;
@@ -16,6 +17,8 @@
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.index.query.QueryBuilders.matchQuery;

@@ -30,8 +33,11 @@ public AcademicSearchService() {

public String search(String q) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("dc");
criterias.add("content");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("term","dc",q,false));
searchRequest.source(buildSource("term",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
@@ -47,8 +53,10 @@ public String search(String q) throws IOException {

public String searchBy(String q, String criteria) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add(criteria);
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("match",criteria,q,false));
searchRequest.source(buildSource("match",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
@@ -66,8 +74,10 @@ public String searchBy(String q, String criteria) throws IOException {

public String getAll() throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("matchAll","","",true));
searchRequest.source(buildSource("matchAll",criterias,"",true));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
@@ -128,16 +138,16 @@ public String toJson(String nonJsonString){



public SearchSourceBuilder buildSource(String queryType, String criteria, String q, Boolean showAllFields){
public SearchSourceBuilder buildSource(String queryType, ArrayList<String> criteria, String q, Boolean showAllFields){
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();

if (queryType.equals("match"))
{
searchSourceBuilder.query(matchQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.matchQuery(criteria.get(0),q));
}
else if (queryType.equals("term"))
{
searchSourceBuilder.query(QueryBuilders.termQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.multiMatchQuery(q,criteria.toArray(new String[criteria.size()])));
}
else
{
@@ -147,7 +157,8 @@ else if (queryType.equals("term"))
searchSourceBuilder.sort(new FieldSortBuilder("title.keyword").order(SortOrder.DESC));
if (!showAllFields)
{
String[] includeFields = new String[] {"title",criteria};
criteria.add("title");
String[] includeFields = criteria.toArray(new String[criteria.size()]);
String[] excludeFields = new String[] {""};
searchSourceBuilder.fetchSource(includeFields,excludeFields);
searchSourceBuilder.fetchSource(true);
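For reference, the rewritten "term" branch in buildSource now issues a multi_match query over the supplied field list ("dc" and "content" for the default search). A self-contained sketch of the same QueryBuilders call, assuming an Elasticsearch 6.x client on the classpath; the query text is a placeholder:

import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class MultiMatchSketch {
    public static void main(String[] args) {
        String[] fields = {"dc", "content"};   // same field list that search(q) builds above
        SearchSourceBuilder source = new SearchSourceBuilder();
        source.query(QueryBuilders.multiMatchQuery("sample query text", fields)); // query string is hypothetical
        System.out.println(source);            // prints the generated query DSL as JSON
    }
}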
(diff of an additional changed file; file name not rendered)
@@ -77,7 +77,7 @@ public boolean submitUrl(String url) throws InterruptedException {
public String addRule(String url)
{
// String rule = "?metadataPrefix=oai_dc&verb=ListRecords";
String rule = "?from=2018-01-07&until=2018-01-08&metadataPrefix=oai_dc&verb=ListRecords";
String rule = "?from=2017-01-01&until=2017-01-02&metadataPrefix=oai_dc&verb=ListRecords";
return url+rule;
}

123 changes: 96 additions & 27 deletions src/main/java/io/academic/service/OaiService.java
@@ -9,20 +9,23 @@
import com.google.gson.GsonBuilder;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.luminis.elastic.document.DocumentService;
import eu.luminis.elastic.document.IndexRequest;
import eu.luminis.elastic.document.UpdateRequest;
import eu.luminis.elastic.index.IndexService;
import eu.luminis.elastic.search.SearchService;
import io.academic.dao.DcDao;
import io.academic.entity.*;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpHost;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.*;
import org.elasticsearch.action.support.replication.ReplicationResponse;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.Scroll;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
@@ -39,7 +42,13 @@
import org.springframework.stereotype.Service;

import javax.transaction.Transactional;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
@@ -81,28 +90,30 @@ public RestHighLevelClient getRestClient() {
public static final String INDEX = "harvester";
private static final String TYPE = "oai";

private DocumentService documentService = null;
private IndexService indexService = null;
private SearchService searchService = null;
private IndexRequest request;

@Autowired
public OaiService(DocumentService documentService, IndexService indexService, SearchService searchService) {
this.documentService = documentService;
this.indexService = indexService;
// indexService.createIndex();
this.searchService = searchService;
}

public String elasticSave(Article article) {
IndexRequest request = new IndexRequest(INDEX, TYPE).setEntity(article);
public OaiService()
{

}

if (article.getId() != null) {
request.setId(String.valueOf(article.getId()));
}
public void elasticSave(Article article) throws IOException {
IndexRequest request = new IndexRequest(INDEX,TYPE);
request.setPipeline("academic-pdf");
// before using this pipeline we have to add the pipeline to Elasticsearch with the following command
// PUT _ingest/pipeline/academic-pdf
// {
// "description": "parse pdfs and index into ES",
// "processors" :
// [
// { "attachment" : { "field": "pdf" } },
// { "remove" : { "field": "pdf" } }
// ]
// }

request.source(new Gson().toJson(article), XContentType.JSON);
IndexResponse indexResponse = restClient.index(request);

return documentService.index(request);
}
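As the comment block notes, the academic-pdf pipeline must exist in Elasticsearch before documents are indexed through it, and it relies on the ingest-attachment plugin. A minimal registration sketch using the 6.x low-level REST client; the localhost address and the standalone class are assumptions, not part of this PR:

import org.apache.http.HttpHost;
import org.apache.http.entity.ContentType;
import org.apache.http.nio.entity.NStringEntity;
import org.elasticsearch.client.RestClient;

import java.util.Collections;

public class RegisterPipelineSketch {
    public static void main(String[] args) throws Exception {
        // Same pipeline definition as in the comment above.
        String pipeline = "{"
                + "\"description\": \"parse pdfs and index into ES\","
                + "\"processors\": ["
                + "  { \"attachment\": { \"field\": \"pdf\" } },"
                + "  { \"remove\": { \"field\": \"pdf\" } }"
                + "]"
                + "}";
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            client.performRequest("PUT", "/_ingest/pipeline/academic-pdf",
                    Collections.<String, String>emptyMap(),
                    new NStringEntity(pipeline, ContentType.APPLICATION_JSON));
        }
    }
}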


@@ -131,6 +142,7 @@ public void saveRecords(List<RecordType> recordTypes) {
oaiRecord.setState(0);
oaiRecords.add(oaiRecord);

//TODO: we have to check all the part names and assign them by the related name, not by order
String[] parts = parsedDc.getDc().split(";;");
Article article = new Article();
article.setTitle(parts[0].split("::")[1]);
@@ -140,10 +152,27 @@
article.setPublisher(parts[4].split("::")[1]);
article.setDate(parts[5].split("::")[1]);
article.setType(parts[6].split("::")[1]);
if (parts.length>10)
{
String downloadUrl = parts[10].split("::")[1];
article.setRelation(downloadUrl);
article.setBase64(UrlPdftoBase64(downloadUrl));
}
else
{
article.setRelation("not available");
article.setBase64("bm90IGF2YWlsYWJsZQ=="); //it means not available in base 64
}
article.setDc(parsedDc.getDc());
article.setArticleIdentifier(parseIdentifier(oaiRecord.getIdentifier()));

articles.add(article);

elasticSave(article);
try {
elasticSave(article);
} catch (IOException e) {
e.printStackTrace();
}
});

oaiRecordRepository.save(oaiRecords);
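The DC payload is a flat list of key::value pairs joined with ";;", so the indices used above (title at 0, relation at 10, and so on) depend entirely on field order, which is what the TODO warns about. A tiny hypothetical illustration of the split mechanics; the keys and values are made up:

// Hypothetical two-field payload; real strings come from parsedDc.getDc() and carry many more fields.
String dc = "title::A Sample Article;;relation::http://example.org/article.pdf";
String[] parts = dc.split(";;");
String title = parts[0].split("::")[1];    // "A Sample Article"
String relation = parts[1].split("::")[1]; // "http://example.org/article.pdf"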
@@ -167,6 +196,31 @@ private String marshallDc(MetadataType metadataType) {
}
}

public String UrlPdftoBase64(String url) {
URL oracle = null;
String base64 = "bm90IGF2YWlsYWJsZQ=="; //means not available
System.out.println(url);
try {
oracle = new URL(url);
URLConnection yc = oracle.openConnection();

BufferedInputStream bis = new BufferedInputStream(yc.getInputStream());

byte[] bytes = IOUtils.toByteArray(bis);
bis.close();
base64 = Base64.getEncoder().encodeToString(bytes);
System.out.println(url);
System.out.println(base64);
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}

return base64;

}
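The fallback value returned when the download fails is the same sentinel stored in saveRecords above; decoding it confirms it is just the string "not available" in Base64. A quick standalone check, not part of the PR:

import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class SentinelCheck {
    public static void main(String[] args) {
        byte[] decoded = Base64.getDecoder().decode("bm90IGF2YWlsYWJsZQ==");
        System.out.println(new String(decoded, StandardCharsets.UTF_8)); // prints: not available
    }
}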

private LocalDateTime parseDateTime(String string) {
LocalDateTime ldt;
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd['T'HH:mm:ss'Z']");
@@ -182,12 +236,27 @@ private LocalDateTime parseDateTime(String string) {
// return LocalDateTime.parse(string, formatter);
}

private String parseIdentifier(String oaiId){
String id = oaiId.substring(oaiId.lastIndexOf(':') + 1); // split identifier at ":" and take the last part
id = id.substring(id.lastIndexOf('/') + 1); // split at "/" and take the last part
return id;
}
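parseIdentifier reduces an OAI record identifier to its final segment, first after the last ":" and then after the last "/". Two hypothetical identifiers (not taken from a real provider) and the value it would return:

// parseIdentifier("oai:example.org:article/12345") -> "12345"  (last ":" segment, then last "/" segment)
// parseIdentifier("oai:example.org:67890")         -> "67890"  (no "/", so the second split is a no-op)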

public void delete() throws IOException {

//TODO: check if there are any indices with that name
DeleteIndexRequest request = new DeleteIndexRequest("harvester");
restClient.indices().deleteIndex(request);
public void delete() {

try {
DeleteIndexRequest request = new DeleteIndexRequest("harvester");
restClient.indices().deleteIndex(request);
} catch (ElasticsearchException exception) {
if (exception.status() == RestStatus.NOT_FOUND) {
System.out.println("Index not found and not deleted");
}
} catch (IOException e) {
e.printStackTrace();
}


}

