Beta, with better structure

steelbreeze · Nov 24, 2021 · 92f7efd · 92f7efd
1 parent c59c643
commit 92f7efd
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 26 deletions.
diff --git a/lib/node/index.js b/lib/node/index.js
@@ -1,25 +1,34 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.parse = void 0;
+// unicode character constants
+const lineFeed = '\u000A';
+const carriageReturn = '\u000D';
+const doubleQuote = '\u0022';
+const comma = '\u002C';
+const lineSeperator = '\u2028';
+const paragraphSeperator = '\u2029';
+const byteOrderMark = '\uFEFF';
+// regular expression fragments
+const lineTerminator = `[${lineFeed}${carriageReturn}${lineSeperator}${paragraphSeperator}]+`; // any of the line terminator code points as defined in the ECMAScript® 2019 Language Specification
+const unquoted = `(?=(?:(?:[^${doubleQuote}]*${doubleQuote}){2})*[^${doubleQuote}]*$)`; // a lookahead to ensure whatever is matched is not within double quotes
+// regular expressions used to parse
+const trim = new RegExp(`^${byteOrderMark}|${lineTerminator}$`);
+const rows = new RegExp(`${lineTerminator}${unquoted}`);
+const tokens = new RegExp(`${comma}${unquoted}`);
+const quotes = new RegExp(`(^${doubleQuote}|${doubleQuote}$)`, 'g');
+const doubleDoubleQuotes = new RegExp(`${doubleQuote}${doubleQuote}`, 'g');
 /**
  * Parses a string encoded as comma-separated values
  * @param text The source csv text.
  * @returns An array of objects.
  */
 function parse(text) {
-    // trim byte order mark from beginning and any trailing EOL if present
-    const tokens = text.replace(/^\uFEFF|[\r\n]+$/, '')
-        // split text into rows at EOL
-        .split(/[\r\n]+(?=(?:(?:[^"]*"){2})*[^"]*$)/).map(row => 
-    // split row into tokens based on comma delimiter (unless in quotes); see answer here: https://stackoverflow.com/questions/23582276/split-string-by-comma-but-ignore-commas-inside-quotes/23582323#23582323
-    row.split(/,(?=(?:(?:[^"]*"){2})*[^"]*$)/)
-        // dequote tokens if needed 
-        .map(token => token.replace(/(^"|"$)/g, '')
-        // replace double double quotes with double quotes
-        .replace(/\"\"/g, '"')));
+    // create a table of tokens from the source csv formatted text
+    const table = text.replace(trim, '').split(rows).map(row => row.split(tokens).map(token => token.replace(quotes, '').replace(doubleDoubleQuotes, doubleQuote)));
     // extract the header row and use for the property names
-    const header = tokens.shift();
+    const header = table.shift();
     // convert subsequent rows into objects
-    return header ? tokens.map(row => Object.fromEntries(header.map((column, index) => [column, row[index]]))) : [];
+    return header ? table.map(row => Object.fromEntries(header.map((column, index) => [column, row[index]]))) : [];
 }
 exports.parse = parse;
diff --git a/lib/web/csv.min.js b/lib/web/csv.min.js
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@steelbreeze/csv",
-  "version": "1.0.0-alpha.3",
+  "version": "1.0.0-beta",
   "description": "Tools for reading and writnig files formatted as CSV",
   "main": "lib/node/index.js",
   "module": "lib/node/index.js",

diff --git a/src/index.ts b/src/index.ts
@@ -1,23 +1,35 @@
+// unicode character constants
+const lineFeed           = '\u000A';
+const carriageReturn     = '\u000D';
+const doubleQuote        = '\u0022';
+const comma              = '\u002C';
+const lineSeperator      = '\u2028';
+const paragraphSeperator = '\u2029';
+const byteOrderMark      = '\uFEFF';
+
+// regular expression fragments
+const lineTerminator = `[${lineFeed}${carriageReturn}${lineSeperator}${paragraphSeperator}]+`; // any of the line terminator code points as defined in the ECMAScript® 2019 Language Specification
+const unquoted = `(?=(?:(?:[^${doubleQuote}]*${doubleQuote}){2})*[^${doubleQuote}]*$)`; // a lookahead to ensure whatever is matched is not within double quotes
+
+// regular expressions used to parse
+const trim = new RegExp(`^${byteOrderMark}|${lineTerminator}$`);
+const rows = new RegExp(`${lineTerminator}${unquoted}`);
+const tokens = new RegExp(`${comma}${unquoted}`);
+const quotes = new RegExp(`(^${doubleQuote}|${doubleQuote}$)`, 'g');
+const doubleDoubleQuotes = new RegExp(`${doubleQuote}${doubleQuote}`, 'g');
+
 /**
  * Parses a string encoded as comma-separated values
  * @param text The source csv text.
  * @returns An array of objects.
  */
 export function parse(text: string): Array<any> {
-	// trim byte order mark from beginning and any trailing EOL if present
-	const tokens = text.replace(/^\uFEFF|[\r\n]+$/, '')
-		// split text into rows at EOL (unless in quotes)
-		.split(/\r?\n(?=(?:(?:[^"]*"){2})*[^"]*$)|\r(?=(?:(?:[^"]*"){2})*[^"]*$)/).map(row =>
-			// split row into tokens based on comma delimiter (unless in quotes)
-			row.split(/,(?=(?:(?:[^"]*"){2})*[^"]*$)/)
-				// dequote tokens if needed 
-				.map(token => token.replace(/(^"|"$)/g, '')
-					// replace double double quotes with double quotes
-					.replace(/\"\"/g, '"')));
+	// create a table of tokens from the source csv formatted text
+	const table = text.replace(trim, '').split(rows).map(row => row.split(tokens).map(token => token.replace(quotes, '').replace(doubleDoubleQuotes, doubleQuote)));
 
 	// extract the header row and use for the property names
-	const header = tokens.shift();
+	const header = table.shift();
 
 	// convert subsequent rows into objects
-	return header ? tokens.map(row => Object.fromEntries(header.map((column, index) => [column, row[index]]))) : [];
+	return header ? table.map(row => Object.fromEntries(header.map((column, index) => [column, row[index]]))) : [];
 }