diff --git a/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverter.scala b/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverter.scala index 9400569eaf46..5b78de348cc7 100644 --- a/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverter.scala +++ b/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverter.scala @@ -25,7 +25,6 @@ import org.locationtech.geomesa.utils.text.TextTools import java.io.InputStream import java.nio.charset.Charset import java.nio.file.{Files, Paths} -import java.util.Collections import scala.collection.mutable.ArrayBuffer class ShapefileConverter(sft: SimpleFeatureType, config: BasicConfig, fields: Seq[BasicField], options: BasicOptions) @@ -50,7 +49,7 @@ class ShapefileConverter(sft: SimpleFeatureType, config: BasicConfig, fields: Se throw new IllegalArgumentException(s"Shapefile converter requires '${EvaluationContext.InputFilePathKey}' " + "to be set in the evaluation context") } - val ds = ShapefileConverter.getDataStore(path) + val ds = ShapefileConverter.getDataStore(path, options.encoding) val schema = ds.getSchema() (ec.accessor(InputSchemaKey).apply(), ec.accessor(InputValuesKey).apply()) match { @@ -119,16 +118,16 @@ object ShapefileConverter extends LazyLogging { * @param path input path * @return */ - def getDataStore(path: String): ShapefileDataStore = { - val params = Collections.singletonMap(ShapefileDataStoreFactory.URLP.key, PathUtils.getUrl(path)) + def getDataStore(path: String, charset: Charset): ShapefileDataStore = { + val params = java.util.Map.of( + ShapefileDataStoreFactory.URLP.key, PathUtils.getUrl(path), + ShapefileDataStoreFactory.DBFCHARSET.key, charset + ) val ds = DataStoreFinder.getDataStore(params).asInstanceOf[ShapefileDataStore] - tryInferCharsetFromCPG(path) match { - case Some(charset) => ds.setCharset(charset) - case None => - } if (ds == null) { throw new IllegalArgumentException(s"Could not read shapefile using path '$path'") } + tryInferCharsetFromCPG(path).foreach(ds.setCharset) ds } @@ -137,7 +136,7 @@ object ShapefileConverter extends LazyLogging { val shpDirPath = Paths.get(path).getParent val (baseName, _) = PathUtils.getBaseNameAndExtension(path) val cpgPath = shpDirPath.resolve(baseName + ".cpg") - if (!Files.isRegularFile(cpgPath)) None else { + if (!Files.isRegularFile(cpgPath)) { None } else { val source = scala.io.Source.fromFile(cpgPath.toFile) try { source.getLines.take(1).toList match { diff --git a/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterFactory.scala b/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterFactory.scala index 0d5365d2a3dc..d248b4d5d2ce 100644 --- a/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterFactory.scala +++ b/geomesa-convert/geomesa-convert-shp/src/main/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterFactory.scala @@ -8,9 +8,10 @@ package org.locationtech.geomesa.convert.shp -import com.typesafe.config.Config +import com.typesafe.config.{Config, ConfigFactory, ConfigValueFactory} import com.typesafe.scalalogging.LazyLogging import org.geotools.api.feature.simple.SimpleFeatureType +import org.geotools.data.shapefile.ShapefileDataStoreFactory import org.locationtech.geomesa.convert.EvaluationContext import org.locationtech.geomesa.convert.shp.ShapefileConverterFactory.TypeToProcess import org.locationtech.geomesa.convert2.AbstractConverter.{BasicConfig, BasicField, BasicOptions} @@ -20,7 +21,8 @@ import org.locationtech.geomesa.convert2.transforms.Expression.Column import org.locationtech.geomesa.utils.io.WithClose import java.io.InputStream -import scala.util.{Failure, Success, Try} +import java.nio.charset.Charset +import scala.util.{Failure, Try} class ShapefileConverterFactory extends AbstractConverterFactory[ShapefileConverter, BasicConfig, BasicField, BasicOptions]( @@ -44,7 +46,7 @@ class ShapefileConverterFactory } Try { - WithClose(ShapefileConverter.getDataStore(url)) { ds => + WithClose(ShapefileConverter.getDataStore(url, ShapefileConverterFactory.DefaultCharset)) { ds => val fields = sft match { case None => var i = 0 @@ -76,8 +78,17 @@ class ShapefileConverterFactory } } } + + override protected def withDefaults(conf: Config): Config = + super.withDefaults(conf.withFallback(ShapefileConverterFactory.ShpConfigDefaults)) } object ShapefileConverterFactory extends LazyLogging { + val TypeToProcess: String = "shp" + + private val DefaultCharset: Charset = ShapefileDataStoreFactory.DBFCHARSET.getDefaultValue.asInstanceOf[Charset] + + private val ShpConfigDefaults: Config = + ConfigFactory.empty().withValue("options.encoding", ConfigValueFactory.fromAnyRef(DefaultCharset.name())) } diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.dbf b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw/gis_osm_pofw_free_1.dbf similarity index 100% rename from geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.dbf rename to geomesa-convert/geomesa-convert-shp/src/test/resources/pofw/gis_osm_pofw_free_1.dbf diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.shp b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw/gis_osm_pofw_free_1.shp similarity index 100% rename from geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.shp rename to geomesa-convert/geomesa-convert-shp/src/test/resources/pofw/gis_osm_pofw_free_1.shp diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.shx b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw/gis_osm_pofw_free_1.shx similarity index 100% rename from geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.shx rename to geomesa-convert/geomesa-convert-shp/src/test/resources/pofw/gis_osm_pofw_free_1.shx diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.cpg b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.cpg similarity index 100% rename from geomesa-convert/geomesa-convert-shp/src/test/resources/gis_osm_pofw_free_1.cpg rename to geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.cpg diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.dbf b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.dbf new file mode 100644 index 000000000000..51fe20144248 Binary files /dev/null and b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.dbf differ diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.shp b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.shp new file mode 100644 index 000000000000..60c48cb9cf15 Binary files /dev/null and b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.shp differ diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.shx b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.shx new file mode 100644 index 000000000000..889dbf006fd6 Binary files /dev/null and b/geomesa-convert/geomesa-convert-shp/src/test/resources/pofw_cpg/gis_osm_pofw_free_1.shx differ diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/cb_2017_us_state_20m.dbf b/geomesa-convert/geomesa-convert-shp/src/test/resources/us_state/cb_2017_us_state_20m.dbf similarity index 100% rename from geomesa-convert/geomesa-convert-shp/src/test/resources/cb_2017_us_state_20m.dbf rename to geomesa-convert/geomesa-convert-shp/src/test/resources/us_state/cb_2017_us_state_20m.dbf diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/cb_2017_us_state_20m.shp b/geomesa-convert/geomesa-convert-shp/src/test/resources/us_state/cb_2017_us_state_20m.shp similarity index 100% rename from geomesa-convert/geomesa-convert-shp/src/test/resources/cb_2017_us_state_20m.shp rename to geomesa-convert/geomesa-convert-shp/src/test/resources/us_state/cb_2017_us_state_20m.shp diff --git a/geomesa-convert/geomesa-convert-shp/src/test/resources/cb_2017_us_state_20m.shx b/geomesa-convert/geomesa-convert-shp/src/test/resources/us_state/cb_2017_us_state_20m.shx similarity index 100% rename from geomesa-convert/geomesa-convert-shp/src/test/resources/cb_2017_us_state_20m.shx rename to geomesa-convert/geomesa-convert-shp/src/test/resources/us_state/cb_2017_us_state_20m.shx diff --git a/geomesa-convert/geomesa-convert-shp/src/test/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterTest.scala b/geomesa-convert/geomesa-convert-shp/src/test/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterTest.scala index 9bb349d45095..e18ef0283dbe 100644 --- a/geomesa-convert/geomesa-convert-shp/src/test/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterTest.scala +++ b/geomesa-convert/geomesa-convert-shp/src/test/scala/org/locationtech/geomesa/convert/shp/ShapefileConverterTest.scala @@ -29,7 +29,7 @@ class ShapefileConverterTest extends Specification { val sft = SimpleFeatureTypes.createType("states", spec) - lazy val shp = this.getClass.getClassLoader.getResource("cb_2017_us_state_20m.shp") + lazy val shp = this.getClass.getClassLoader.getResource("us_state/cb_2017_us_state_20m.shp") lazy val shpFile = Paths.get(shp.toURI).toFile.getAbsolutePath // fields in the shapefile: @@ -106,31 +106,35 @@ class ShapefileConverterTest extends Specification { } } - "parse shapefile with cpg file" in { + "parse shapefile with cpg file or specific encoding" in { val spec = "*the_geom:Point,name:String" val sft = SimpleFeatureTypes.createType("gis_osm_pofw", spec) - lazy val shp = this.getClass.getClassLoader.getResource("gis_osm_pofw_free_1.shp") - lazy val shpFile = Paths.get(shp.toURI).toFile.getAbsolutePath - val conf = ConfigFactory.parseString( - """ - |{ - | "id-field" : "$0", - | "type" : "shp", - | "fields" : [ - | { "name" : "the_geom", "transform" : "$1" }, - | { "name" : "name", "transform" : "$5" } - | ] - |} - """.stripMargin) - - WithClose(SimpleFeatureConverter(sft, conf)) { converter => - converter must not(beNull) - val ec = converter.createEvaluationContext(EvaluationContext.inputFileParam(shpFile)) - val res = SelfClosingIterator(converter.process(shp.openStream(), ec)).toList - - // strings should be properly decoded - res.map(_.getAttribute("name")) must containAllOf(Seq("法海寺", "རུ་ཐོག་དགོན་ (日多寺)", "Pagoda")) + foreach(Seq("pofw_cpg" -> None, "pofw" -> Some("UTF-8"))) { case (dir, encoding) => + val shp = this.getClass.getClassLoader.getResource(s"$dir/gis_osm_pofw_free_1.shp") + val shpFile = Paths.get(shp.toURI).toFile.getAbsolutePath + + val conf = ConfigFactory.parseString( + s""" + |{ + | "id-field" : "$$0", + | "type" : "shp", + | "options": { ${encoding.map(e => s"encoding: $e").getOrElse("")} } + | "fields" : [ + | { "name" : "the_geom", "transform" : "$$1" }, + | { "name" : "name", "transform" : "$$5" } + | ] + |} + """.stripMargin) + + WithClose(SimpleFeatureConverter(sft, conf)) { converter => + converter must not(beNull) + val ec = converter.createEvaluationContext(EvaluationContext.inputFileParam(shpFile)) + val res = SelfClosingIterator(converter.process(shp.openStream(), ec)).toList + + // strings should be properly decoded + res.map(_.getAttribute("name")) must containAllOf(Seq("法海寺", "རུ་ཐོག་དགོན་ (日多寺)", "Pagoda")) + } } } }