From fbee0719067bcef38d49104fe05246a5ddd0b358 Mon Sep 17 00:00:00 2001 From: Eugene Date: Sun, 9 Feb 2020 23:13:28 +0800 Subject: [PATCH] 1. Enable viewing of ORC format data. 2. Refactor code for better extensibility, so that adding support for other data formats such as Avro is no longer that painful. --- .../eugene/controller/DashboardRenderer.java | 2 +- .../org/eugene/controller/DataParser.java | 7 ++ .../org/eugene/controller/ORCDataParser.java | 12 +++ .../eugene/controller/ParquetDataParser.java | 68 +++++++++++++++++ .../java/org/eugene/controller/Renderer.java | 74 ++++++------------- .../org/eugene/controller/TableRenderer.java | 2 +- .../java/org/eugene/core/orc/ORCReader.java | 73 ++++++++++++++++++ .../java/org/eugene/model/CommonData.java | 33 +++++++++ src/main/java/org/eugene/model/Parquet.java | 2 + .../java/org/eugene/persistent/VirtualDB.java | 35 +++++++++ .../java/org/eugene/ui/CustomizedMenuBar.java | 2 +- src/main/java/org/eugene/ui/Dashboard.java | 18 +---- .../org/eugene/ui/SelectPropertyDialog.java | 7 +- src/main/java/org/eugene/ui/Table.java | 6 +- src/main/java/org/eugene/util/CSVWriter.java | 16 ++-- 15 files changed, 273 insertions(+), 84 deletions(-) create mode 100644 src/main/java/org/eugene/controller/DataParser.java create mode 100644 src/main/java/org/eugene/controller/ORCDataParser.java create mode 100644 src/main/java/org/eugene/controller/ParquetDataParser.java create mode 100644 src/main/java/org/eugene/core/orc/ORCReader.java create mode 100644 src/main/java/org/eugene/model/CommonData.java create mode 100644 src/main/java/org/eugene/persistent/VirtualDB.java diff --git a/src/main/java/org/eugene/controller/DashboardRenderer.java b/src/main/java/org/eugene/controller/DashboardRenderer.java index eaddf86..820e092 100644 --- a/src/main/java/org/eugene/controller/DashboardRenderer.java +++ b/src/main/java/org/eugene/controller/DashboardRenderer.java @@ -12,7 +12,7 @@ public void setDashboard(Dashboard dashboard){ this.dashboard = dashboard; } - public 
void refreshMetaInfo(Schema schema, File selectedFile, int rowNumber, int columnNumber){ + public void refreshMetaInfo(String schema, File selectedFile, int rowNumber, int columnNumber){ dashboard.refresh(schema, selectedFile, rowNumber, columnNumber); } } diff --git a/src/main/java/org/eugene/controller/DataParser.java b/src/main/java/org/eugene/controller/DataParser.java new file mode 100644 index 0000000..4472654 --- /dev/null +++ b/src/main/java/org/eugene/controller/DataParser.java @@ -0,0 +1,7 @@ +package org.eugene.controller; + +import org.apache.hadoop.fs.Path; + +public abstract class DataParser { + public abstract boolean parseData(Path path); +} diff --git a/src/main/java/org/eugene/controller/ORCDataParser.java b/src/main/java/org/eugene/controller/ORCDataParser.java new file mode 100644 index 0000000..0b7921b --- /dev/null +++ b/src/main/java/org/eugene/controller/ORCDataParser.java @@ -0,0 +1,12 @@ +package org.eugene.controller; + +import org.apache.hadoop.fs.Path; +import org.eugene.core.orc.ORCReader; + +public class ORCDataParser extends DataParser { + @Override + public boolean parseData(Path path) { + ORCReader reader = new ORCReader(); + return reader.read(path); + } +} diff --git a/src/main/java/org/eugene/controller/ParquetDataParser.java b/src/main/java/org/eugene/controller/ParquetDataParser.java new file mode 100644 index 0000000..18f90f3 --- /dev/null +++ b/src/main/java/org/eugene/controller/ParquetDataParser.java @@ -0,0 +1,68 @@ +package org.eugene.controller; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.hadoop.fs.Path; +import org.eugene.core.parquet.ParquetReader; +import org.eugene.model.CommonData; +import org.eugene.model.TableMeta; +import org.eugene.persistent.VirtualDB; +import org.eugene.ui.Notifier; + +import java.util.ArrayList; +import java.util.List; + +public class ParquetDataParser extends DataParser{ + @Override + public boolean parseData(Path path) { + ParquetReader 
reader = new ParquetReader(); + List originalData = reader.read(path); + if(originalData == null) + { + return false; + } + if (originalData.isEmpty()) { + Notifier.info("The file is empty"); + return false; + } + + GenericData.Record firstRecord = originalData.get(0); + Schema schema = firstRecord.getSchema(); + + int rowNumber = originalData.size(); + List propertyList = new ArrayList<>(); + for (Schema.Field field: schema.getFields()) + { + String property = field.name(); + propertyList.add(property); + } + int columnNumber = propertyList.size(); + TableMeta tableMeta = new TableMeta(); + tableMeta.setRow(rowNumber); + tableMeta.setColumn(columnNumber); + + List> data = new ArrayList<>(); + for (int i = 0; i < originalData.size(); i++) { + GenericData.Record record = originalData.get(i); + List commonRecord = new ArrayList<>(); + for (int j = 0; j < columnNumber; j++) { + if (record.get(j) == null){ + commonRecord.add("NULL"); + }else{ + commonRecord.add(String.valueOf(record.get(j))); + } + } + data.add(commonRecord); + } + + CommonData commonData = new CommonData(); + commonData.setSchema(schema.toString()); + commonData.setData(data); + commonData.setPropertyList(propertyList); + + VirtualDB.getInstance().setCommonData(commonData); + VirtualDB.getInstance().setTableMeta(tableMeta); + + return true; + } +} diff --git a/src/main/java/org/eugene/controller/Renderer.java b/src/main/java/org/eugene/controller/Renderer.java index dc4db70..adb091d 100644 --- a/src/main/java/org/eugene/controller/Renderer.java +++ b/src/main/java/org/eugene/controller/Renderer.java @@ -6,8 +6,10 @@ import org.apache.avro.generic.GenericData; import org.apache.hadoop.fs.Path; import org.eugene.core.parquet.ParquetReader; +import org.eugene.model.CommonData; import org.eugene.model.Parquet; import org.eugene.model.TableMeta; +import org.eugene.persistent.VirtualDB; import org.eugene.ui.*; import java.io.File; @@ -21,15 +23,9 @@ public class Renderer { private DashboardRenderer 
dashboardRenderer; private List showingList; - private List propertyList; private File selectedFile; - private Parquet parquet; - private TableMeta tableMeta; - - private Schema schema; - public Renderer(Stage stage){ this.stage = stage; tableRenderer = new TableRenderer(); @@ -46,53 +42,30 @@ public void initUI(){ } public boolean loadAndShow(){ - boolean status = prepareData(); - if (status) { - tableRenderer.init(); - showingList = propertyList; - dashboardRenderer.refreshMetaInfo(parquet.getSchema(), selectedFile, tableMeta.getRow(), tableMeta.getColumn()); - tableRenderer.refresh(showingList, propertyList, tableMeta.getRow(), tableMeta.getColumn(), parquet.getData()); - } - return status; - } - - private boolean prepareData(){ FileChooser filechooser = new FileChooser(); selectedFile = filechooser.showOpenDialog(stage); - Path path = new Path(selectedFile.getAbsolutePath()); - ParquetReader reader = new ParquetReader(); - List data = reader.read(path); - if(data == null) - { - return false; - } - if (data.isEmpty()) { - Notifier.info("The file is empty"); + String absolutePath = selectedFile.getAbsolutePath(); + Path path = new Path(absolutePath); + DataParser dataParser; + if (absolutePath.endsWith(".orc")){ + dataParser = new ORCDataParser(); + }else { + dataParser = new ParquetDataParser(); } - parquet = new Parquet(); - parquet.setData(data); - GenericData.Record record = data.get(0); - schema = record.getSchema(); - parquet.setSchema(schema); - int rowNumber = data.size(); - showingList = new ArrayList<>(); - propertyList = new ArrayList<>(); - for (Schema.Field field: schema.getFields()) - { - String property = field.name(); - showingList.add(property); - propertyList.add(property); + boolean status = dataParser.parseData(path); + if (status) { + tableRenderer.init(); + CommonData commonData = VirtualDB.getInstance().getCommonData(); + TableMeta tableMeta = VirtualDB.getInstance().getTableMeta(); + showingList = commonData.getPropertyList(); + 
dashboardRenderer.refreshMetaInfo(commonData.getSchema(), selectedFile, tableMeta.getRow(), tableMeta.getColumn()); + tableRenderer.refresh(showingList, commonData.getPropertyList(), tableMeta.getRow(), tableMeta.getColumn(), commonData.getData()); } - int columnNumber = propertyList.size(); - tableMeta = new TableMeta(); - tableMeta.setRow(rowNumber); - tableMeta.setColumn(columnNumber); - - return true; + return status; } - public List getData(){ - return parquet.getData(); + public List> getData(){ + return VirtualDB.getInstance().getCommonData().getData(); } public void refreshTable(){ @@ -100,10 +73,9 @@ public void refreshTable(){ } public void refreshTable(List showingList){ - tableRenderer.refresh(showingList, propertyList, tableMeta.getRow(), tableMeta.getColumn(), parquet.getData()); + CommonData commonData = VirtualDB.getInstance().getCommonData(); + TableMeta tableMeta = VirtualDB.getInstance().getTableMeta(); + tableRenderer.refresh(showingList, commonData.getPropertyList(), tableMeta.getRow(), tableMeta.getColumn(), commonData.getData()); } - public Schema getSchema() { - return schema; - } } diff --git a/src/main/java/org/eugene/controller/TableRenderer.java b/src/main/java/org/eugene/controller/TableRenderer.java index 975d5a9..7872c6c 100644 --- a/src/main/java/org/eugene/controller/TableRenderer.java +++ b/src/main/java/org/eugene/controller/TableRenderer.java @@ -18,7 +18,7 @@ public void setTable(Table table){ this.table = table; } - public void refresh(List showingList, List propertyList, int rowNumber, int columnNumber, List data){ + public void refresh(List showingList, List propertyList, int rowNumber, int columnNumber, List> data){ table.refresh(showingList, propertyList, rowNumber, columnNumber, data); } diff --git a/src/main/java/org/eugene/core/orc/ORCReader.java b/src/main/java/org/eugene/core/orc/ORCReader.java new file mode 100644 index 0000000..8848937 --- /dev/null +++ b/src/main/java/org/eugene/core/orc/ORCReader.java @@ -0,0 +1,73 
@@ +package org.eugene.core.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.Reader; +import org.apache.hadoop.hive.ql.io.orc.RecordReader; + +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.eugene.model.CommonData; +import org.eugene.model.TableMeta; +import org.eugene.persistent.VirtualDB; +import org.eugene.ui.Notifier; + +import java.util.ArrayList; +import java.util.List; + +public class ORCReader { + public boolean read(Path path){ + try{ + Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration())); + StructObjectInspector inspector = (StructObjectInspector)reader.getObjectInspector(); + String schema = reader.getSchema().toJson(); + //The JSON schema provided is illegal, so need to make it valid firstly + schema = schema.replaceAll("(\"[\\w]+\"):([\\s]+[{]+)", "$1,$2"); + RecordReader records = reader.rows(); + //These objects are the metadata for each column. 
They give you the type of each column and can parse it unless you + //want to parse each column yourself + List fields = inspector.getAllStructFieldRefs(); + List propertyList = new ArrayList<>(); + int columnNumber = fields.size(); + for(int i = 0; i < fields.size(); ++i) { + propertyList.add(((StructField)fields.get(i)).getFieldObjectInspector().getTypeName()); + } + + Object row = null; + List> data = new ArrayList<>(); + while(records.hasNext()) + { + row = records.next(row); + List list = inspector.getStructFieldsDataAsList(row); + StringBuilder builder = new StringBuilder(); + List record = new ArrayList<>(); + for(Object field : list) { + if(field != null){ + record.add(field.toString()); + } + else{ + record.add("NULL"); + } + } + data.add(record); + } + CommonData commonData = new CommonData(); + commonData.setPropertyList(propertyList); + commonData.setSchema(schema); + commonData.setData(data); + TableMeta tableMeta = new TableMeta(); + tableMeta.setColumn(columnNumber); + tableMeta.setRow(data.size()); + VirtualDB.getInstance().setCommonData(commonData); + VirtualDB.getInstance().setTableMeta(tableMeta); + return true; + }catch(Exception e){ + e.printStackTrace(); + Notifier.error("Failed to load the file! 
The exception throws is: " + e.getMessage()); + return false; + } + + } + +} diff --git a/src/main/java/org/eugene/model/CommonData.java b/src/main/java/org/eugene/model/CommonData.java new file mode 100644 index 0000000..00b3dc4 --- /dev/null +++ b/src/main/java/org/eugene/model/CommonData.java @@ -0,0 +1,33 @@ +package org.eugene.model; + +import java.util.List; + +public class CommonData { + private String schema; + private List> data; + private List propertyList; + + public void setSchema(String schema){ + this.schema = schema; + } + + public String getSchema(){ + return schema; + } + + public void setData(List> data){ + this.data = data; + } + + public List> getData(){ + return data; + } + + public void setPropertyList(List propertyList){ + this.propertyList = propertyList; + } + + public List getPropertyList(){ + return propertyList; + } +} diff --git a/src/main/java/org/eugene/model/Parquet.java b/src/main/java/org/eugene/model/Parquet.java index 9de9730..aa1f132 100644 --- a/src/main/java/org/eugene/model/Parquet.java +++ b/src/main/java/org/eugene/model/Parquet.java @@ -6,6 +6,7 @@ import java.util.List; public class Parquet { + /** private Schema schema; private List data; @@ -24,4 +25,5 @@ public List getData(){ public void setData(List data){ this.data = data; } + **/ } diff --git a/src/main/java/org/eugene/persistent/VirtualDB.java b/src/main/java/org/eugene/persistent/VirtualDB.java new file mode 100644 index 0000000..598b23e --- /dev/null +++ b/src/main/java/org/eugene/persistent/VirtualDB.java @@ -0,0 +1,35 @@ +package org.eugene.persistent; + +import org.apache.calcite.avatica.proto.Common; +import org.eugene.model.CommonData; +import org.eugene.model.TableMeta; + +public class VirtualDB { + private static VirtualDB instance = new VirtualDB(); + private CommonData commonData; + private TableMeta tableMeta; + + private VirtualDB(){ + + } + + public static VirtualDB getInstance(){ + return instance; + } + + public void setCommonData(CommonData 
commonData){ + this.commonData = commonData; + } + + public CommonData getCommonData(){ + return commonData; + } + + public void setTableMeta(TableMeta tableMeta){ + this.tableMeta = tableMeta; + } + + public TableMeta getTableMeta(){ + return tableMeta; + } +} diff --git a/src/main/java/org/eugene/ui/CustomizedMenuBar.java b/src/main/java/org/eugene/ui/CustomizedMenuBar.java index 27423dd..2e03543 100644 --- a/src/main/java/org/eugene/ui/CustomizedMenuBar.java +++ b/src/main/java/org/eugene/ui/CustomizedMenuBar.java @@ -47,7 +47,7 @@ public CustomizedMenuBar(Stage stage){ FileChooser fileChooser = new FileChooser(); File csvFile = fileChooser.showSaveDialog(stage); Path path = new Path(csvFile.getAbsolutePath()); - ArrayList list = (ArrayList) renderer.getData(); + ArrayList> list = (ArrayList>) renderer.getData(); CSVWriter.write(new Path(csvFile.getAbsolutePath()), list); }); MenuItem close = new MenuItem("Close"); diff --git a/src/main/java/org/eugene/ui/Dashboard.java b/src/main/java/org/eugene/ui/Dashboard.java index efabb14..15be8bd 100644 --- a/src/main/java/org/eugene/ui/Dashboard.java +++ b/src/main/java/org/eugene/ui/Dashboard.java @@ -33,7 +33,7 @@ public void setVBox(VBox vBox){ this.vBox = vBox; } - public void refresh(Schema schema, File selectedFile, int rowNumber, int columnNumber){ + public void refresh(String schema, File selectedFile, int rowNumber, int columnNumber){ vBox.getChildren().clear(); Accordion accordion = new Accordion(); refreshSummaryPane(selectedFile, rowNumber, columnNumber, accordion); @@ -72,26 +72,16 @@ private void refreshSummaryPane(File selectedFile, int rowNumber, int columnNumb accordion.getPanes().add(summaryPane); } - private void refreshMetaPane(Schema schema, Accordion accordion){ - String schemaJson = schema.toString(); + private void refreshMetaPane(String schema, Accordion accordion){ Gson gson = new GsonBuilder().setPrettyPrinting().create(); JsonParser jp = new JsonParser(); - JsonElement je = 
jp.parse(schemaJson); + JsonElement je = jp.parse(schema); String prettySchemaJson = gson.toJson(je); Map schemaMap = new HashMap<>(); - for (Schema.Field field: schema.getFields()) - { - String name = field.name(); - String type = TypeFetcher.getType(field.schema().toString()); - schemaMap.put(name,type); - } - TitledPane metaPane = new TitledPane(); metaPane.setText("Schema Information"); VBox metaBox = new VBox(); - schemaMap.forEach((k,v) -> { - //metaBox.getChildren().add(new Label(k + " : " + v)); - }); + TextArea textArea = new TextArea(); textArea.setWrapText(true); textArea.setEditable(false); diff --git a/src/main/java/org/eugene/ui/SelectPropertyDialog.java b/src/main/java/org/eugene/ui/SelectPropertyDialog.java index 2f38d57..1019dae 100644 --- a/src/main/java/org/eugene/ui/SelectPropertyDialog.java +++ b/src/main/java/org/eugene/ui/SelectPropertyDialog.java @@ -10,6 +10,7 @@ import org.apache.avro.Schema; import org.eugene.controller.Renderer; import org.eugene.controller.TableRenderer; +import org.eugene.persistent.VirtualDB; import java.util.ArrayList; import java.util.List; @@ -18,11 +19,7 @@ class SelectPropertyDialog { private Dialog> dialog; public void init(Stage stage, Renderer renderer){ - Schema schema = renderer.getSchema(); - List properties = new ArrayList<>(); - for (Schema.Field field: schema.getFields()){ - properties.add(field.name()); - } + List properties = VirtualDB.getInstance().getCommonData().getPropertyList(); dialog = new Dialog<>(); VBox vBox = new VBox(); VBox checkBoxGroup = new VBox(2); diff --git a/src/main/java/org/eugene/ui/Table.java b/src/main/java/org/eugene/ui/Table.java index 737a14a..d7aa90f 100644 --- a/src/main/java/org/eugene/ui/Table.java +++ b/src/main/java/org/eugene/ui/Table.java @@ -43,7 +43,7 @@ public void initTable(){ - public void refresh(List showingList, List propertyList, int rowNumber, int columnNumber, List data){ + public void refresh(List showingList, List propertyList, int rowNumber, int 
columnNumber, List> data){ initTable(); int index = 0; @@ -77,7 +77,7 @@ public void refresh(List showingList, List propertyList, int row }); } - private void generatePage(List data, TableView tableView, int pageIndex, int pageRowNum, int colNumber, List showingList, List propertyList){ + private void generatePage(List> data, TableView tableView, int pageIndex, int pageRowNum, int colNumber, List showingList, List propertyList){ ObservableList> content = FXCollections.observableArrayList(); int start = pageIndex * pageRowNum; int end = start + pageRowNum; @@ -85,7 +85,7 @@ private void generatePage(List data, TableView tableView, in end = data.size(); } for (int i = start; i < end; i++) { - GenericData.Record r = data.get(i); + List r = data.get(i); List row = new ArrayList(); int index = 0; for (int j = 0; j < colNumber; j++){ diff --git a/src/main/java/org/eugene/util/CSVWriter.java b/src/main/java/org/eugene/util/CSVWriter.java index 87da58c..97a8b51 100644 --- a/src/main/java/org/eugene/util/CSVWriter.java +++ b/src/main/java/org/eugene/util/CSVWriter.java @@ -3,31 +3,31 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.hadoop.fs.Path; +import org.eugene.persistent.VirtualDB; import java.io.PrintWriter; import java.util.List; public class CSVWriter { - public static boolean write(Path path, List list){ + public static boolean write(Path path, List> data){ try{ PrintWriter out = new PrintWriter(path.toString()); - if (list.size() == 0) { + if (data.size() == 0) { out.write(""); return true; } - GenericData.Record firstRow = list.get(0); - List fields = firstRow.getSchema().getFields(); - int colNumber = fields.size(); + List propertyList = VirtualDB.getInstance().getCommonData().getPropertyList(); + int colNumber = propertyList.size(); for (int i = 0; i < colNumber; i++) { if (i == (colNumber - 1)){ - out.println(fields.get(i).name()); + out.println(propertyList.get(i)); }else{ - out.print(fields.get(i).name()); + 
out.print(propertyList.get(i)); out.print(","); } } - for (GenericData.Record record: list) { + for (List record: data) { for (int i = 0; i < colNumber; i++) { if (i == (colNumber - 1)) { if (record.get(i) == null)