-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
2, Refactor code for better extensibility. It won't be that annoying to add other format data like avro now.
- Loading branch information
Eugene
committed
Feb 9, 2020
1 parent
129396a
commit fbee071
Showing
15 changed files
with
273 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package org.eugene.controller; | ||
|
||
import org.apache.hadoop.fs.Path; | ||
|
||
public abstract class DataParser { | ||
public abstract boolean parseData(Path path); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package org.eugene.controller; | ||
|
||
import org.apache.hadoop.fs.Path; | ||
import org.eugene.core.orc.ORCReader; | ||
|
||
public class ORCDataParser extends DataParser { | ||
@Override | ||
public boolean parseData(Path path) { | ||
ORCReader reader = new ORCReader(); | ||
return reader.read(path); | ||
} | ||
} |
68 changes: 68 additions & 0 deletions
68
src/main/java/org/eugene/controller/ParquetDataParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
package org.eugene.controller; | ||
|
||
import org.apache.avro.Schema; | ||
import org.apache.avro.generic.GenericData; | ||
import org.apache.hadoop.fs.Path; | ||
import org.eugene.core.parquet.ParquetReader; | ||
import org.eugene.model.CommonData; | ||
import org.eugene.model.TableMeta; | ||
import org.eugene.persistent.VirtualDB; | ||
import org.eugene.ui.Notifier; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class ParquetDataParser extends DataParser{ | ||
@Override | ||
public boolean parseData(Path path) { | ||
ParquetReader reader = new ParquetReader(); | ||
List<GenericData.Record> originalData = reader.read(path); | ||
if(originalData == null) | ||
{ | ||
return false; | ||
} | ||
if (originalData.isEmpty()) { | ||
Notifier.info("The file is empty"); | ||
return false; | ||
} | ||
|
||
GenericData.Record firstRecord = originalData.get(0); | ||
Schema schema = firstRecord.getSchema(); | ||
|
||
int rowNumber = originalData.size(); | ||
List<String> propertyList = new ArrayList<>(); | ||
for (Schema.Field field: schema.getFields()) | ||
{ | ||
String property = field.name(); | ||
propertyList.add(property); | ||
} | ||
int columnNumber = propertyList.size(); | ||
TableMeta tableMeta = new TableMeta(); | ||
tableMeta.setRow(rowNumber); | ||
tableMeta.setColumn(columnNumber); | ||
|
||
List<List<String>> data = new ArrayList<>(); | ||
for (int i = 0; i < originalData.size(); i++) { | ||
GenericData.Record record = originalData.get(i); | ||
List<String> commonRecord = new ArrayList<>(); | ||
for (int j = 0; j < columnNumber; j++) { | ||
if (record.get(j) == null){ | ||
commonRecord.add("NULL"); | ||
}else{ | ||
commonRecord.add(String.valueOf(record.get(j))); | ||
} | ||
} | ||
data.add(commonRecord); | ||
} | ||
|
||
CommonData commonData = new CommonData(); | ||
commonData.setSchema(schema.toString()); | ||
commonData.setData(data); | ||
commonData.setPropertyList(propertyList); | ||
|
||
VirtualDB.getInstance().setCommonData(commonData); | ||
VirtualDB.getInstance().setTableMeta(tableMeta); | ||
|
||
return true; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
package org.eugene.core.orc; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.fs.Path; | ||
import org.apache.hadoop.hive.ql.io.orc.OrcFile; | ||
import org.apache.hadoop.hive.ql.io.orc.Reader; | ||
import org.apache.hadoop.hive.ql.io.orc.RecordReader; | ||
|
||
import org.apache.hadoop.hive.serde2.objectinspector.StructField; | ||
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; | ||
import org.eugene.model.CommonData; | ||
import org.eugene.model.TableMeta; | ||
import org.eugene.persistent.VirtualDB; | ||
import org.eugene.ui.Notifier; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class ORCReader { | ||
public boolean read(Path path){ | ||
try{ | ||
Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration())); | ||
StructObjectInspector inspector = (StructObjectInspector)reader.getObjectInspector(); | ||
String schema = reader.getSchema().toJson(); | ||
//The JSON schema provided is illegal, so need to make it valid firstly | ||
schema = schema.replaceAll("(\"[\\w]+\"):([\\s]+[{]+)", "$1,$2"); | ||
RecordReader records = reader.rows(); | ||
//These objects are the metadata for each column. They give you the type of each column and can parse it unless you | ||
//want to parse each column yourself | ||
List fields = inspector.getAllStructFieldRefs(); | ||
List<String> propertyList = new ArrayList<>(); | ||
int columnNumber = fields.size(); | ||
for(int i = 0; i < fields.size(); ++i) { | ||
propertyList.add(((StructField)fields.get(i)).getFieldObjectInspector().getTypeName()); | ||
} | ||
|
||
Object row = null; | ||
List<List<String>> data = new ArrayList<>(); | ||
while(records.hasNext()) | ||
{ | ||
row = records.next(row); | ||
List list = inspector.getStructFieldsDataAsList(row); | ||
StringBuilder builder = new StringBuilder(); | ||
List<String> record = new ArrayList<>(); | ||
for(Object field : list) { | ||
if(field != null){ | ||
record.add(field.toString()); | ||
} | ||
else{ | ||
record.add("NULL"); | ||
} | ||
} | ||
data.add(record); | ||
} | ||
CommonData commonData = new CommonData(); | ||
commonData.setPropertyList(propertyList); | ||
commonData.setSchema(schema); | ||
commonData.setData(data); | ||
TableMeta tableMeta = new TableMeta(); | ||
tableMeta.setColumn(columnNumber); | ||
tableMeta.setRow(data.size()); | ||
VirtualDB.getInstance().setCommonData(commonData); | ||
VirtualDB.getInstance().setTableMeta(tableMeta); | ||
return true; | ||
}catch(Exception e){ | ||
e.printStackTrace(); | ||
Notifier.error("Failed to load the file! The exception throws is: " + e.getMessage()); | ||
return false; | ||
} | ||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
package org.eugene.model; | ||
|
||
import java.util.List; | ||
|
||
/**
 * Format-independent, mutable holder for the contents of a loaded file: the
 * schema text, the column names and the rows rendered as strings.
 *
 * <p>Setters store the given references as-is and getters return them as-is;
 * no copies are made.
 */
public class CommonData {
    /** Schema of the loaded file as text. */
    private String schema;
    /** Rows of the loaded file; each inner list is one row of cell values. */
    private List<List<String>> data;
    /** One entry per column. */
    private List<String> propertyList;

    /**
     * @param schema schema text to store
     */
    public void setSchema(String schema){
        this.schema = schema;
    }

    /**
     * @return the stored schema text, or {@code null} if none was set
     */
    public String getSchema(){
        return schema;
    }

    /**
     * @param data rows to store, one inner list per row
     */
    public void setData(List<List<String>> data){
        this.data = data;
    }

    /**
     * @return the stored rows, or {@code null} if none were set
     */
    public List<List<String>> getData(){
        return data;
    }

    /**
     * @param propertyList per-column entries to store
     */
    public void setPropertyList(List<String> propertyList){
        this.propertyList = propertyList;
    }

    /**
     * @return the stored per-column entries, or {@code null} if none were set
     */
    public List<String> getPropertyList(){
        return propertyList;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package org.eugene.persistent; | ||
|
||
import org.apache.calcite.avatica.proto.Common; | ||
import org.eugene.model.CommonData; | ||
import org.eugene.model.TableMeta; | ||
|
||
public class VirtualDB { | ||
private static VirtualDB instance = new VirtualDB(); | ||
private CommonData commonData; | ||
private TableMeta tableMeta; | ||
|
||
private VirtualDB(){ | ||
|
||
} | ||
|
||
public static VirtualDB getInstance(){ | ||
return instance; | ||
} | ||
|
||
public void setCommonData(CommonData commonData){ | ||
this.commonData = commonData; | ||
} | ||
|
||
public CommonData getCommonData(){ | ||
return commonData; | ||
} | ||
|
||
public void setTableMeta(TableMeta tableMeta){ | ||
this.tableMeta = tableMeta; | ||
} | ||
|
||
public TableMeta getTableMeta(){ | ||
return tableMeta; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.