Skip to content

Commit

Permalink
Add CP1144 code page (Italy), which is the same as CP280 except that the character at 9F is replac…
Browse files Browse the repository at this point in the history
…ed with the "€" (euro) character.
  • Loading branch information
yruslan committed Jan 14, 2025
1 parent e690708 commit 2f9309a
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1644,6 +1644,7 @@ The output looks like this:
| .option("ebcdic_code_page", "cp1141") | EBCDIC 1141 | Same as code page 273 with € at the position of the international currency symbol ¤. |
| .option("ebcdic_code_page", "cp1142") | EBCDIC 1142 | Same as code page 277 with € at the position of the international currency symbol ¤. |
| .option("ebcdic_code_page", "cp1143") | EBCDIC 1143 | Same as code page 278 with € at the position of the international currency symbol ¤. |
| .option("ebcdic_code_page", "cp1144") | EBCDIC 1144 | Same as code page 280 with € at the position of the international currency symbol ¤. |
| .option("ebcdic_code_page", "cp1145") | EBCDIC 1145 | Same as code page 284 with € at the position of the international currency symbol ¤. |
| .option("ebcdic_code_page", "cp1148") | EBCDIC 1148 | Same as code page 500 with € at the position of the international currency symbol ¤. |
| .option("ebcdic_code_page", "cp1364") | EBCDIC 1364 | Double-byte code page CCSID-1364, Korean. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ object CodePage extends Logging {
case "cp1141" => new CodePage1141
case "cp1142" => new CodePage1142
case "cp1143" => new CodePage1143
case "cp1144" => new CodePage1144
case "cp1145" => new CodePage1145
case "cp1146" => new CodePage1146
case "cp1148" => new CodePage1148
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.parser.encoding.codepage

/**
  * Single-byte EBCDIC code page 1144 (Italy).
  *
  * The layout is that of code page 280 with one difference: position 9F holds the euro sign €
  * where code page 280 has the international currency symbol ¤.
  */
class CodePage1144 extends SingleByteCodePage(CodePage1144.ebcdicToAsciiMapping) {
  /** Short name of this code page. */
  override def codePageShortName: String = "cp1144"
}

object CodePage1144 {
  /**
    * EBCDIC code page 1144 conversion table: the element at index `b` is the character
    * that EBCDIC byte value `b` is converted to (256 entries, 16 per row below).
    *
    * The layout follows the code page 280 table from
    * https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_280
    * with the euro sign '€' at index 159 (0x9F).
    *
    * Non-printable characters map used: http://www.pacsys.com/asciitab.htm
    */
  val ebcdicToAsciiMapping: Array[Char] = {
    // Brings the named non-printable character constants (c00, spc, del, nel, ...) into scope.
    import EbcdicNonPrintable._

    Array[Char](
      c00, c01, c02, c03, spc, c09, spc, del, spc, spc, spc, c0b, c0c, ccr, c0e, c0f, //   0 - 15
      c10, c11, c12, c13, spc, nel, c08, spc, c18, c19, spc, spc, c1c, c1d, c1e, c1f, //  16 - 31
      spc, spc, spc, spc, spc, clf, c17, c1b, spc, spc, spc, spc, spc, c05, c06, c07, //  32 - 47
      spc, spc, c16, spc, spc, spc, spc, c04, spc, spc, spc, spc, c14, c15, spc, c1a, //  48 - 63
      ' ', rsp, 'â', 'ä', '{', 'á', 'ã', 'å', bsh, 'ñ', '°', '.', '<', '(', '+', '!', //  64 - 79
      '&', ']', 'ê', 'ë', '}', 'í', 'î', 'ï', '~', 'ß', 'é', '$', '*', ')', ';', '^', //  80 - 95
      '-', '/', 'Â', 'Ä', 'À', 'Á', 'Ã', 'Å', 'Ç', 'Ñ', 'ò', ',', '%', '_', '>', '?', //  96 - 111
      'ø', 'É', 'Ê', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'ù', ':', '£', '§', qts, '=', qtd, // 112 - 127
      'Ø', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', '«', '»', 'ð', 'ý', 'þ', '±', // 128 - 143
      '[', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ª', 'º', 'æ', '¸', 'Æ', '€', // 144 - 159
      'µ', 'ì', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'Ð', 'Ý', 'Þ', '®', // 160 - 175
      '¢', '#', '¥', '·', '©', '@', '¶', '¼', '½', '¾', '¬', '|', '¯', '¨', '´', '×', // 176 - 191
      'à', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', shy, 'ô', 'ö', '¦', 'ó', 'õ', // 192 - 207
      'è', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', '¹', 'û', 'ü', '`', 'ú', 'ÿ', // 208 - 223
      'ç', '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '²', 'Ô', 'Ö', 'Ò', 'Ó', 'Õ', // 224 - 239
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', 'Û', 'Ü', 'Ù', 'Ú', spc  // 240 - 255
    )
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,28 @@ class StringDecodersSpec extends AnyWordSpec {
assert(actual == expected)
}

"decode a CP1144 string example" in {
  // Input: every byte value from 0x42 through 0xFE, in ascending order.
  val bytes = Array.range(0x42, 0xFF).map(_.toByte)

  // One expected character per input byte, in the same order.
  val expected = "âä{áãå\\ñ°.<(+!&]êë}íîï~ßé$*);^-/ÂÄÀÁÃÅÇÑò,%_>?øÉÊËÈÍÎÏÌù:£§'=\"Øabcdefghi«»ðýþ±[jklmnopqrªºæ¸Æ€µìstuvwxyz¡¿ÐÝÞ®¢#¥·©@¶¼½¾¬|¯¨´×àABCDEFGHI\u00ADôö¦óõèJKLMNOPQR¹ûü`úÿç÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ"

  val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1144, improvedNullDetection = false)

  assert(actual == expected)
}

"decode a CP1145 string special characters" in {
val expected = " äÑ|üܬ§ñ]ߢ[Ö{}æö¨å@ÆØÅÄÉ€ "
val bytes = Array(0x40, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,11 @@ class CodePageSingleByteSpec extends AnyFunSuite {
assert(codePage.codePageShortName == "cp1143")
}

test("Ensure codepage 'cp1144' gives the associated CodePage") {
  // Look the code page up by name and check that it reports the same short name back.
  val resolved = CodePage.getCodePageByName("cp1144")
  assert(resolved.codePageShortName == "cp1144")
}

test("Ensure codepage 'cp1145' gives the associated CodePage") {
val codePage = CodePage.getCodePageByName("cp1145")
assert(codePage.codePageShortName == "cp1145")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ private[source] object IndexBuilder extends Logging {
buildIndexForVarLenReader(filesList, reader, sqlContext)
case _ =>
buildIndexForFullFiles(filesList, sqlContext)
case _ => null
}
}

Expand Down

0 comments on commit 2f9309a

Please sign in to comment.