This repository has been archived by the owner on Oct 22, 2020. It is now read-only.

Merge pull request #25 from academic/feature/pdf-parser
Feature/pdf parser
Hüseyin Mert authored Mar 17, 2018
2 parents 76c75ee + d9a360c commit fb89840
Showing 6 changed files with 176 additions and 43 deletions.
40 changes: 40 additions & 0 deletions src/main/java/io/academic/entity/Article.java
@@ -11,6 +11,10 @@
@Entity
public class Article extends AbstractAuditingEntity {

public Article(){

}

@Column
@Type(type = "text")
private String title;
@@ -43,6 +47,18 @@ public class Article extends AbstractAuditingEntity {
@Type(type = "text")
private String type;

@Column
@Type(type = "text")
private String base64;

@Column
@Type(type = "text")
private String articleIdentifier;

@Column
@Type(type = "text")
private String relation;

public String getTitle() {
return title;
}
@@ -106,4 +122,28 @@ public String getType() {
public void setType(String type) {
this.type = type;
}

public String getBase64() {
return base64;
}

public void setBase64(String base64) {
this.base64 = base64;
}

public String getArticleIdentifier() {
return articleIdentifier;
}

public void setArticleIdentifier(String articleIdentifier) {
this.articleIdentifier = articleIdentifier;
}

public String getRelation() {
return relation;
}

public void setRelation(String relation) {
this.relation = relation;
}
}
18 changes: 17 additions & 1 deletion src/main/java/io/academic/entity/OaiDataProvider.java
@@ -16,12 +16,18 @@ public OaiDataProvider(){

}

public OaiDataProvider(String name, String url, String identifier) {
public OaiDataProvider(String name, String url,String downloadUrl, String identifier) {
this.name = name;
this.url = url;
this.downloadUrl = downloadUrl;
this.identifier = identifier;
}

public OaiDataProvider(String url,String downloadUrl) {
this.url = url;
this.downloadUrl = downloadUrl;
}

public OaiDataProvider(String url) {
this.url = url;
}
@@ -32,6 +38,9 @@ public OaiDataProvider(String url) {
@Column
private String url;

@Column
private String downloadUrl;

@Column
private String identifier;

@@ -69,5 +78,12 @@ public OaiDataProvider setIdentifier(String identifier) {
return this;
}

public String getDownloadUrl() {
return downloadUrl;
}

public void setDownloadUrl(String downloadUrl) {
this.downloadUrl = downloadUrl;
}

}
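The new downloadUrl column separates the provider's OAI-PMH endpoint from the address used to fetch full-text PDFs. A minimal usage sketch of the new two-argument constructor; both URLs below are hypothetical placeholders, not real provider addresses:

// Hypothetical provider; only the constructor and getter added in this diff are used.
OaiDataProvider provider = new OaiDataProvider(
        "http://example.org/oai/request",        // OAI-PMH endpoint (url)
        "http://example.org/article/download");  // where PDFs are fetched from (downloadUrl)
String pdfBase = provider.getDownloadUrl();      // "http://example.org/article/download"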
25 changes: 18 additions & 7 deletions src/main/java/io/academic/service/AcademicSearchService.java
@@ -6,6 +6,7 @@
import com.google.gson.JsonParser;
import org.elasticsearch.action.search.*;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.Scroll;
import org.elasticsearch.search.SearchHit;
@@ -16,6 +17,8 @@
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.index.query.QueryBuilders.matchQuery;

@@ -30,8 +33,11 @@ public AcademicSearchService() {

public String search(String q) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("dc");
criterias.add("content");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("term","dc",q,false));
searchRequest.source(buildSource("term",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
@@ -47,8 +53,10 @@ public String search(String q) throws IOException {

public String searchBy(String q, String criteria) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add(criteria);
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("match",criteria,q,false));
searchRequest.source(buildSource("match",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
@@ -66,8 +74,10 @@ public String searchBy(String q, String criteria) throws IOException {

public String getAll() throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("matchAll","","",true));
searchRequest.source(buildSource("matchAll",criterias,"",true));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
@@ -128,16 +138,16 @@ public String toJson(String nonJsonString){



public SearchSourceBuilder buildSource(String queryType, String criteria, String q, Boolean showAllFields){
public SearchSourceBuilder buildSource(String queryType, ArrayList<String> criteria, String q, Boolean showAllFields){
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();

if (queryType.equals("match"))
{
searchSourceBuilder.query(matchQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.matchQuery(criteria.get(0),q));
}
else if (queryType.equals("term"))
{
searchSourceBuilder.query(QueryBuilders.termQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.multiMatchQuery(q,criteria.toArray(new String[criteria.size()])));
}
else
{
@@ -147,7 +157,8 @@ else if (queryType.equals("term"))
searchSourceBuilder.sort(new FieldSortBuilder("title.keyword").order(SortOrder.DESC));
if (!showAllFields)
{
String[] includeFields = new String[] {"title",criteria};
criteria.add("title");
String[] includeFields = criteria.toArray(new String[criteria.size()]);
String[] excludeFields = new String[] {""};
searchSourceBuilder.fetchSource(includeFields,excludeFields);
searchSourceBuilder.fetchSource(true);
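For reference, the rewritten "term" branch in buildSource now issues a multi_match query over the supplied field list ("dc" and "content" for the default search). A self-contained sketch of the same QueryBuilders call, assuming an Elasticsearch 6.x client on the classpath; the query text is a placeholder:

import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class MultiMatchSketch {
    public static void main(String[] args) {
        String[] fields = {"dc", "content"};   // same field list that search(q) builds above
        SearchSourceBuilder source = new SearchSourceBuilder();
        source.query(QueryBuilders.multiMatchQuery("sample query text", fields)); // query string is hypothetical
        System.out.println(source);            // prints the generated query DSL as JSON
    }
}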
(diff of an additional changed file; file name not rendered)
@@ -77,7 +77,7 @@ public boolean submitUrl(String url) throws InterruptedException {
public String addRule(String url)
{
// String rule = "?metadataPrefix=oai_dc&verb=ListRecords";
String rule = "?from=2018-01-07&until=2018-01-08&metadataPrefix=oai_dc&verb=ListRecords";
String rule = "?from=2017-01-01&until=2017-01-02&metadataPrefix=oai_dc&verb=ListRecords";
return url+rule;
}

123 changes: 96 additions & 27 deletions src/main/java/io/academic/service/OaiService.java
@@ -9,20 +9,23 @@
import com.google.gson.GsonBuilder;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.luminis.elastic.document.DocumentService;
import eu.luminis.elastic.document.IndexRequest;
import eu.luminis.elastic.document.UpdateRequest;
import eu.luminis.elastic.index.IndexService;
import eu.luminis.elastic.search.SearchService;
import io.academic.dao.DcDao;
import io.academic.entity.*;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpHost;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.*;
import org.elasticsearch.action.support.replication.ReplicationResponse;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.Scroll;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
@@ -39,7 +42,13 @@
import org.springframework.stereotype.Service;

import javax.transaction.Transactional;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
@@ -81,28 +90,30 @@ public RestHighLevelClient getRestClient() {
public static final String INDEX = "harvester";
private static final String TYPE = "oai";

private DocumentService documentService = null;
private IndexService indexService = null;
private SearchService searchService = null;
private IndexRequest request;

@Autowired
public OaiService(DocumentService documentService, IndexService indexService, SearchService searchService) {
this.documentService = documentService;
this.indexService = indexService;
// indexService.createIndex();
this.searchService = searchService;
}

public String elasticSave(Article article) {
IndexRequest request = new IndexRequest(INDEX, TYPE).setEntity(article);
public OaiService()
{

}

if (article.getId() != null) {
request.setId(String.valueOf(article.getId()));
}
public void elasticSave(Article article) throws IOException {
IndexRequest request = new IndexRequest(INDEX,TYPE);
request.setPipeline("academic-pdf");
// before using this pipeline we have to add the pipeline to Elasticsearch with the following command
// PUT _ingest/pipeline/academic-pdf
// {
// "description": "parse pdfs and index into ES",
// "processors" :
// [
// { "attachment" : { "field": "pdf" } },
// { "remove" : { "field": "pdf" } }
// ]
// }

request.source(new Gson().toJson(article), XContentType.JSON);
IndexResponse indexResponse = restClient.index(request);

return documentService.index(request);
}
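As the comment block notes, the academic-pdf pipeline must exist in Elasticsearch before documents are indexed through it, and it relies on the ingest-attachment plugin. A minimal registration sketch using the 6.x low-level REST client; the localhost address and the standalone class are assumptions, not part of this PR:

import org.apache.http.HttpHost;
import org.apache.http.entity.ContentType;
import org.apache.http.nio.entity.NStringEntity;
import org.elasticsearch.client.RestClient;

import java.util.Collections;

public class RegisterPipelineSketch {
    public static void main(String[] args) throws Exception {
        // Same pipeline definition as in the comment above.
        String pipeline = "{"
                + "\"description\": \"parse pdfs and index into ES\","
                + "\"processors\": ["
                + "  { \"attachment\": { \"field\": \"pdf\" } },"
                + "  { \"remove\": { \"field\": \"pdf\" } }"
                + "]"
                + "}";
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            client.performRequest("PUT", "/_ingest/pipeline/academic-pdf",
                    Collections.<String, String>emptyMap(),
                    new NStringEntity(pipeline, ContentType.APPLICATION_JSON));
        }
    }
}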


@@ -131,6 +142,7 @@ public void saveRecords(List<RecordType> recordTypes) {
oaiRecord.setState(0);
oaiRecords.add(oaiRecord);

//TODO: we have to check all the part names and assign them by the related name, not by order
String[] parts = parsedDc.getDc().split(";;");
Article article = new Article();
article.setTitle(parts[0].split("::")[1]);
@@ -140,10 +152,27 @@
article.setPublisher(parts[4].split("::")[1]);
article.setDate(parts[5].split("::")[1]);
article.setType(parts[6].split("::")[1]);
if (parts.length>10)
{
String downloadUrl = parts[10].split("::")[1];
article.setRelation(downloadUrl);
article.setBase64(UrlPdftoBase64(downloadUrl));
}
else
{
article.setRelation("not available");
article.setBase64("bm90IGF2YWlsYWJsZQ=="); //it means not available in base 64
}
article.setDc(parsedDc.getDc());
article.setArticleIdentifier(parseIdentifier(oaiRecord.getIdentifier()));

articles.add(article);

elasticSave(article);
try {
elasticSave(article);
} catch (IOException e) {
e.printStackTrace();
}
});

oaiRecordRepository.save(oaiRecords);
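The DC payload is a flat list of key::value pairs joined with ";;", so the indices used above (title at 0, relation at 10, and so on) depend entirely on field order, which is what the TODO warns about. A tiny hypothetical illustration of the split mechanics; the keys and values are made up:

// Hypothetical two-field payload; real strings come from parsedDc.getDc() and carry many more fields.
String dc = "title::A Sample Article;;relation::http://example.org/article.pdf";
String[] parts = dc.split(";;");
String title = parts[0].split("::")[1];    // "A Sample Article"
String relation = parts[1].split("::")[1]; // "http://example.org/article.pdf"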
@@ -167,6 +196,31 @@ private String marshallDc(MetadataType metadataType) {
}
}

public String UrlPdftoBase64(String url) {
URL oracle = null;
String base64 = "bm90IGF2YWlsYWJsZQ=="; //means not available
System.out.println(url);
try {
oracle = new URL(url);
URLConnection yc = oracle.openConnection();

BufferedInputStream bis = new BufferedInputStream(yc.getInputStream());

byte[] bytes = IOUtils.toByteArray(bis);
bis.close();
base64 = Base64.getEncoder().encodeToString(bytes);
System.out.println(url);
System.out.println(base64);
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}

return base64;

}
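The fallback value returned when the download fails is the same sentinel stored in saveRecords above; decoding it confirms it is just the string "not available" in Base64. A quick standalone check, not part of the PR:

import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class SentinelCheck {
    public static void main(String[] args) {
        byte[] decoded = Base64.getDecoder().decode("bm90IGF2YWlsYWJsZQ==");
        System.out.println(new String(decoded, StandardCharsets.UTF_8)); // prints: not available
    }
}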

private LocalDateTime parseDateTime(String string) {
LocalDateTime ldt;
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd['T'HH:mm:ss'Z']");
@@ -182,12 +236,27 @@ private LocalDateTime parseDateTime(String string) {
// return LocalDateTime.parse(string, formatter);
}

private String parseIdentifier(String oaiId){
String id = oaiId.substring(oaiId.lastIndexOf(':') + 1); // split identifier at ":" and take the last part
id = id.substring(id.lastIndexOf('/') + 1); // split at "/" and take the last part
return id;
}
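parseIdentifier reduces an OAI record identifier to its final segment, first after the last ":" and then after the last "/". Two hypothetical identifiers (not taken from a real provider) and the value it would return:

// parseIdentifier("oai:example.org:article/12345") -> "12345"  (last ":" segment, then last "/" segment)
// parseIdentifier("oai:example.org:67890")         -> "67890"  (no "/", so the second split is a no-op)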

public void delete() throws IOException {

//TODO: check if there are any indices with that name
DeleteIndexRequest request = new DeleteIndexRequest("harvester");
restClient.indices().deleteIndex(request);
public void delete() {

try {
DeleteIndexRequest request = new DeleteIndexRequest("harvester");
restClient.indices().deleteIndex(request);
} catch (ElasticsearchException exception) {
if (exception.status() == RestStatus.NOT_FOUND) {
System.out.println("Index not found and not deleted");
}
} catch (IOException e) {
e.printStackTrace();
}


}

