Skip to content
This repository has been archived by the owner on Oct 22, 2020. It is now read-only.

Commit

Permalink
parsed pdf added to search result
Browse files Browse the repository at this point in the history
  • Loading branch information
fakturk committed Mar 16, 2018
1 parent 6e47e22 commit d9a360c
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 95 deletions.
25 changes: 18 additions & 7 deletions src/main/java/io/academic/service/AcademicSearchService.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import com.google.gson.JsonParser;
import org.elasticsearch.action.search.*;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.Scroll;
import org.elasticsearch.search.SearchHit;
Expand All @@ -16,6 +17,8 @@
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.index.query.QueryBuilders.matchQuery;

Expand All @@ -30,8 +33,11 @@ public AcademicSearchService() {

public String search(String q) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("dc");
criterias.add("content");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("term","dc",q,false));
searchRequest.source(buildSource("term",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
Expand All @@ -47,8 +53,10 @@ public String search(String q) throws IOException {

public String searchBy(String q, String criteria) throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add(criteria);
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("match",criteria,q,false));
searchRequest.source(buildSource("match",criterias,q,false));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
Expand All @@ -66,8 +74,10 @@ public String searchBy(String q, String criteria) throws IOException {

public String getAll() throws IOException {

ArrayList<String> criterias = new ArrayList<String>();
criterias.add("");
SearchRequest searchRequest = new SearchRequest("harvester");
searchRequest.source(buildSource("matchAll","","",true));
searchRequest.source(buildSource("matchAll",criterias,"",true));

//these values are necessary if we need scrollable results (in other words, if our result has more than 10 hits)
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(1));
Expand Down Expand Up @@ -128,16 +138,16 @@ public String toJson(String nonJsonString){



public SearchSourceBuilder buildSource(String queryType, String criteria, String q, Boolean showAllFields){
public SearchSourceBuilder buildSource(String queryType, ArrayList<String> criteria, String q, Boolean showAllFields){
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();

if (queryType.equals("match"))
{
searchSourceBuilder.query(matchQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.matchQuery(criteria.get(0),q));
}
else if (queryType.equals("term"))
{
searchSourceBuilder.query(QueryBuilders.termQuery(criteria,q));
searchSourceBuilder.query(QueryBuilders.multiMatchQuery(q,criteria.toArray(new String[criteria.size()])));
}
else
{
Expand All @@ -147,7 +157,8 @@ else if (queryType.equals("term"))
searchSourceBuilder.sort(new FieldSortBuilder("title.keyword").order(SortOrder.DESC));
if (!showAllFields)
{
String[] includeFields = new String[] {"title",criteria};
criteria.add("title");
String[] includeFields = criteria.toArray(new String[criteria.size()]);
String[] excludeFields = new String[] {""};
searchSourceBuilder.fetchSource(includeFields,excludeFields);
searchSourceBuilder.fetchSource(true);
Expand Down
100 changes: 14 additions & 86 deletions src/main/java/io/academic/service/OaiService.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,67 +90,30 @@ public RestHighLevelClient getRestClient() {
public static final String INDEX = "harvester";
private static final String TYPE = "oai";

// private DocumentService documentService = null;
// private IndexService indexService = null;
// private SearchService searchService = null;
// private IndexRequest request;

// @Autowired
// public OaiService(DocumentService documentService, IndexService indexService, SearchService searchService) {
// this.documentService = documentService;
// this.indexService = indexService;
//// indexService.createIndex();
// this.searchService = searchService;
// }

@Autowired
public OaiService()
{

}

public void elasticSave(Article article) throws IOException {
// System.out.println("inside elasticsave");

// IndexRequest request = new IndexRequest(INDEX, TYPE).setEntity(article);
// System.out.println("before article get Article Identifier");
// System.out.println(article.getArticleIdentifier());
// if (article.getArticleIdentifier() != null) {
// request.setId(String.valueOf(article.getId()));
// System.out.println("inside article getid");
IndexRequest request = new IndexRequest(INDEX,TYPE);
request.setPipeline("academic-pdf");
// System.out.println(new Gson().toJson(article));
request.source(new Gson().toJson(article), XContentType.JSON);
// before using this pipeline we have to register the pipeline in Elasticsearch with the following command
// PUT _ingest/pipeline/academic-pdf
// {
// "description": "parse pdfs and index into ES",
// "processors" :
// [
// { "attachment" : { "field": "pdf" } },
// { "remove" : { "field": "pdf" } }
// ]
// }



request.source(new Gson().toJson(article), XContentType.JSON);
IndexResponse indexResponse = restClient.index(request);

// String index = indexResponse.getIndex();
// String type = indexResponse.getType();
// String id = indexResponse.getId();
// long version = indexResponse.getVersion();
// System.out.println(index+" ,"+type+", "+id+", "+version);
// if (indexResponse.getResult() == DocWriteResponse.Result.CREATED) {
//
// } else if (indexResponse.getResult() == DocWriteResponse.Result.UPDATED) {
//
// }
// ReplicationResponse.ShardInfo shardInfo = indexResponse.getShardInfo();
// if (shardInfo.getTotal() != shardInfo.getSuccessful()) {
//
// }
// if (shardInfo.getFailed() > 0) {
// for (ReplicationResponse.ShardInfo.Failure failure : shardInfo.getFailures()) {
// String reason = failure.reason();
// }
// }




// return documentService.index(request);
}


Expand Down Expand Up @@ -179,11 +142,9 @@ public void saveRecords(List<RecordType> recordTypes) {
oaiRecord.setState(0);
oaiRecords.add(oaiRecord);

//TODO: we have to check all the part names and assign them according to the related name, not by order
String[] parts = parsedDc.getDc().split(";;");
Article article = new Article();
// System.out.println("article create sonrasi article id : "+article.getId());
// System.out.println("article create sonrasi oai id : "+oaiRecord.getId());

article.setTitle(parts[0].split("::")[1]);
article.setAuthors(parts[1].split("::")[1]);
article.setKeywords(parts[2].split("::")[1]);
Expand All @@ -195,23 +156,18 @@ public void saveRecords(List<RecordType> recordTypes) {
{
String downlaodUrl = parts[10].split("::")[1];
article.setRelation(downlaodUrl);
// article.setBase64("not available");
article.setBase64(UrlPdftoBase64(downlaodUrl));
}
else
{
article.setRelation("not available");
article.setBase64("bm90IGF2YWlsYWJsZQ==");
article.setBase64("bm90IGF2YWlsYWJsZQ=="); //this is the string "not available" encoded in Base64
}
article.setDc(parsedDc.getDc());
article.setArticleIdentifier(parseIdentifier(oaiRecord.getIdentifier()));
// article.setArticleIdentifier(oaiRecord.getIdentifier());

// System.out.println("article add oncesi article id : "+article.getId());
articles.add(article);

// System.out.println("elastic save oncesi article id : "+article.getId());
// System.out.println("elastic save oncesi article title : "+article.getTitle());
try {
elasticSave(article);
} catch (IOException e) {
Expand Down Expand Up @@ -247,13 +203,8 @@ public String UrlPdftoBase64(String url) {
try {
oracle = new URL(url);
URLConnection yc = oracle.openConnection();
// BufferedReader in = new BufferedReader(new InputStreamReader(
// yc.getInputStream()));

BufferedInputStream bis = new BufferedInputStream(yc.getInputStream());
// String inputLine;
// while ((inputLine = in.readLine()) != null)
// System.out.println(inputLine);
// in.close();

byte bytes[] = IOUtils.toByteArray(bis);
bis.close();
Expand All @@ -266,31 +217,8 @@ public String UrlPdftoBase64(String url) {
e.printStackTrace();
}




return base64;

// String inputLine;
// while ((inputLine = in.readLine()) != null)
// System.out.println(inputLine);
// in.close();




// BufferedReader in = new BufferedReader(
// new InputStreamReader(oracle.openStream()));
// byte bytes[] = IOUtils.toByteArray(oracle);


// String b64String = Base64.

// String inputLine;
// while ((inputLine = in.readLine()) != null)
// System.out.println(inputLine);
// in.close();

}

private LocalDateTime parseDateTime(String string) {
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/io/academic/service/ProcessorService.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ public void startProcessors() {
oaiDataProviderService.queue(new OaiDataProvider("Acta Medica Anatolia","http://dergipark.gov.tr/api/public/oai/","http://dergipark.gov.tr/download/article-file/","dergipark.ulakbim.gov.tr" ));
// oaiDataProviderService.queue(new OaiDataProvider("http://export.arxiv.org/oai2","https://arxiv.org/pdf/"));

oaiService.delete();

oaiService.delete();

}

Expand Down

0 comments on commit d9a360c

Please sign in to comment.