diff --git a/product/roundhouse.console/Program.cs b/product/roundhouse.console/Program.cs index 711bfaa2..fa3d4b64 100644 --- a/product/roundhouse.console/Program.cs +++ b/product/roundhouse.console/Program.cs @@ -351,15 +351,16 @@ private static void parse_arguments_and_set_up_configuration(ConfigurationProper "SearchAllSubdirectoriesInsteadOfTraverse - Each Migration folder's subdirectories are traversed by default. This option pulls back scripts from the main directory and all subdirectories at once. Defaults to 'false'", option => configuration.SearchAllSubdirectoriesInsteadOfTraverse = option != null) .Add("isuptodate", - "This option prints whether there are any database updates or not, whithout actually running them. Other output except errors is disabled, to make it easy to use in scripts.", + "This option prints whether there are any database updates or not, without actually running them. Other output except errors is disabled, to make it easy to use in scripts.", option => { }) - .Add("fileencoding=", - "File Encoding - Explicitly specify the file encoding of all files", + // default encoding + .Add("defaultencoding=", + "Default encoding to use for loading script file from disk if file doesn't contain BOM. For the list of possible values see the column Name in table listed in .NET Encoding class documentation. 
Defaults to UTF-8", option => { if(option != null) { - configuration.FileEncoding = System.Text.Encoding.GetEncoding(option); + configuration.DefaultEncoding = System.Text.Encoding.GetEncoding(option); } }) //load configuration from file @@ -414,6 +415,7 @@ private static void parse_arguments_and_set_up_configuration(ConfigurationProper "/dryrun " + "/search[allsubdirectories]insteadoftraverse " + "/isuptodate" + + " /defaultencoding VALUE" + "]", Environment.NewLine); show_help(usage_message, option_set); } diff --git a/product/roundhouse.tasks/Roundhouse.cs b/product/roundhouse.tasks/Roundhouse.cs index ab210dac..30a6c136 100644 --- a/product/roundhouse.tasks/Roundhouse.cs +++ b/product/roundhouse.tasks/Roundhouse.cs @@ -189,7 +189,7 @@ public string RecoveryMode public bool SearchAllSubdirectoriesInsteadOfTraverse { get; set; } - public System.Text.Encoding FileEncoding { get; set; } + public System.Text.Encoding DefaultEncoding { get; set; } public bool DisableOutput { get; set; } public Dictionary UserTokens { get; set; } diff --git a/product/roundhouse.tests/infrastructure/filesystem/DotNetFileSystemAccessSpecs.cs b/product/roundhouse.tests/infrastructure/filesystem/DotNetFileSystemAccessSpecs.cs index 05f175f1..8d035019 100644 --- a/product/roundhouse.tests/infrastructure/filesystem/DotNetFileSystemAccessSpecs.cs +++ b/product/roundhouse.tests/infrastructure/filesystem/DotNetFileSystemAccessSpecs.cs @@ -1,3 +1,4 @@ +using System; using System.IO; using NUnit.Framework; @@ -11,45 +12,120 @@ public class DotNetFileSystemAccessSpecs { public abstract class concern_for_file_system : TinySpec { - protected object result; - private DotNetFileSystemAccess dot_net_file_system_access = new DotNetFileSystemAccess(new DefaultConfiguration()); + protected DotNetFileSystemAccess dot_net_file_system_access; + + protected string utf_expected_string; + + protected string very_small_file; + protected string utf8_file_with_bom; + protected string utf8_file_no_bom; + protected 
string utf16LE_file_with_bom; + protected string utf16BE_file_with_bom; + protected string utf32LE_file_with_bom; + protected string utf32BE_file_with_bom; + protected string ansi_file; - public override void Context() { } protected override DotNetFileSystemAccess sut { get { return dot_net_file_system_access; } set { dot_net_file_system_access = value; } } - } - [Concern(typeof(DotNetFileSystemAccess))] - public class when_reading_files_with_different_formats : concern_for_file_system - { - protected static string utf8_file; - protected static string ansi_file; + private string read_test_file(string file_name) + { + return sut.read_file_text(Path.Combine(TestContext.CurrentContext.TestDirectory, string.Format(@"infrastructure\filesystem\{0}", file_name))); + } public override void Because() { - utf8_file = sut.read_file_text(Path.Combine(TestContext.CurrentContext.TestDirectory, @"infrastructure\filesystem\utf8encoded.txt")); - ansi_file = sut.read_file_text(Path.Combine(TestContext.CurrentContext.TestDirectory, @"infrastructure\filesystem\ansiencoded.txt")); + // NOTE: When testing encodings, it's important to do it so the result doesn't depend on how the compiler itself will read the source file. + // The best way to do it is to work with the byte representation, i.e. do not include any non-ASCII characters (> 127) in source code directly... 
+ string unicode_character = Encoding.UTF8.GetString(new byte[] { 0xc3, 0xa3 }); // small "a" with tilde (utf-8) + utf_expected_string = string.Format("INSERT INTO [dbo].[timmy]([value]) VALUES('G{0}')", unicode_character); + + very_small_file = read_test_file("very_small_file.txt"); + utf8_file_with_bom = read_test_file("utf8_with_bom.txt"); + utf8_file_no_bom = read_test_file("utf8_no_bom.txt"); + utf16LE_file_with_bom = read_test_file("utf16LE_with_bom.txt"); + utf16BE_file_with_bom = read_test_file("utf16BE_with_bom.txt"); + utf32LE_file_with_bom = read_test_file("utf32LE_with_bom.txt"); + utf32BE_file_with_bom = read_test_file("utf32BE_with_bom.txt"); + + ansi_file = read_test_file("ansiencoded.txt"); + } + + [Observation] + public void very_small_file_should_read_correctly() + { + very_small_file.should_be_equal_to("GO"); + } + + [Observation] + public void utf8_encoded_file_with_bom_should_read_correctly() + { + utf8_file_with_bom.should_be_equal_to(utf_expected_string); + } + + [Observation] + public void utf16LE_encoded_file_with_bom_should_read_correctly() + { + utf16LE_file_with_bom.should_be_equal_to(utf_expected_string); + } + + [Observation] + public void utf16BE_encoded_file_with_bom_should_read_correctly() + { + utf16BE_file_with_bom.should_be_equal_to(utf_expected_string); } [Observation] - public void utf8_encoded_file_should_read_correctly() + public void utf32LE_encoded_file_with_bom_should_read_correctly() + { + utf32LE_file_with_bom.should_be_equal_to(utf_expected_string); + } + + [Observation] + public void utf32BE_encoded_file_with_bom_should_read_correctly() + { + utf32BE_file_with_bom.should_be_equal_to(utf_expected_string); + } + } + + + [Concern(typeof(DotNetFileSystemAccess))] + public class when_reading_files_with_different_formats_with_default_configuration : concern_for_file_system + { + public override void Context() { - utf8_file.should_be_equal_to("INSERT INTO [dbo].[timmy]([value]) VALUES('Gã')"); + dot_net_file_system_access = 
new DotNetFileSystemAccess(new DefaultConfiguration()); } [Observation] - public void ansi_encoded_file_should_read_correctly() + public void utf8_encoded_file_without_bom_should_read_correctly() { - ansi_file.should_be_equal_to(to_default_code_page("INSERT INTO [dbo].[timmy]([value]) VALUES('Gã')")); + utf8_file_no_bom.should_be_equal_to(utf_expected_string); } + } + + [Concern(typeof(DotNetFileSystemAccess))] + public class when_reading_files_with_different_formats_with_ansi_encoding_configuration : concern_for_file_system + { + private readonly Encoding ansi_encoding = Encoding.GetEncoding("windows-1252"); - private string to_default_code_page(string english_code_page_text) + public override void Context() { - var bytes = Encoding.GetEncoding(1252).GetBytes(english_code_page_text); - return Encoding.Default.GetString(bytes); + dot_net_file_system_access = new DotNetFileSystemAccess( + new DefaultConfiguration { DefaultEncoding = ansi_encoding } + ); + } + + [Observation] + public void ansi_encoded_file_should_read_correctly() + { + // small "a" with tilde in Windows-1252 + string extended_ascii_character = ansi_encoding.GetString(new byte[] { 0xe3 }); + + ansi_file.should_be_equal_to(string.Format("INSERT INTO [dbo].[timmy]([value]) VALUES('G{0}')", extended_ascii_character)); } } } diff --git a/product/roundhouse.tests/infrastructure/filesystem/utf16BE_with_bom.txt b/product/roundhouse.tests/infrastructure/filesystem/utf16BE_with_bom.txt new file mode 100644 index 00000000..f1c981c3 Binary files /dev/null and b/product/roundhouse.tests/infrastructure/filesystem/utf16BE_with_bom.txt differ diff --git a/product/roundhouse.tests/infrastructure/filesystem/utf16LE_with_bom.txt b/product/roundhouse.tests/infrastructure/filesystem/utf16LE_with_bom.txt new file mode 100644 index 00000000..2932c8e8 Binary files /dev/null and b/product/roundhouse.tests/infrastructure/filesystem/utf16LE_with_bom.txt differ diff --git 
a/product/roundhouse.tests/infrastructure/filesystem/utf32BE_with_bom.txt b/product/roundhouse.tests/infrastructure/filesystem/utf32BE_with_bom.txt new file mode 100644 index 00000000..38f11f2c Binary files /dev/null and b/product/roundhouse.tests/infrastructure/filesystem/utf32BE_with_bom.txt differ diff --git a/product/roundhouse.tests/infrastructure/filesystem/utf32LE_with_bom.txt b/product/roundhouse.tests/infrastructure/filesystem/utf32LE_with_bom.txt new file mode 100644 index 00000000..fd4acc76 Binary files /dev/null and b/product/roundhouse.tests/infrastructure/filesystem/utf32LE_with_bom.txt differ diff --git a/product/roundhouse.tests/infrastructure/filesystem/utf8_no_bom.txt b/product/roundhouse.tests/infrastructure/filesystem/utf8_no_bom.txt new file mode 100644 index 00000000..b0ed196e --- /dev/null +++ b/product/roundhouse.tests/infrastructure/filesystem/utf8_no_bom.txt @@ -0,0 +1 @@ +INSERT INTO [dbo].[timmy]([value]) VALUES('Gã') \ No newline at end of file diff --git a/product/roundhouse.tests/infrastructure/filesystem/utf8encoded.txt b/product/roundhouse.tests/infrastructure/filesystem/utf8_with_bom.txt similarity index 100% rename from product/roundhouse.tests/infrastructure/filesystem/utf8encoded.txt rename to product/roundhouse.tests/infrastructure/filesystem/utf8_with_bom.txt diff --git a/product/roundhouse.tests/infrastructure/filesystem/very_small_file.txt b/product/roundhouse.tests/infrastructure/filesystem/very_small_file.txt new file mode 100644 index 00000000..8d65f3c1 --- /dev/null +++ b/product/roundhouse.tests/infrastructure/filesystem/very_small_file.txt @@ -0,0 +1 @@ +GO \ No newline at end of file diff --git a/product/roundhouse.tests/roundhouse.tests.csproj b/product/roundhouse.tests/roundhouse.tests.csproj index 05a81406..dc3c9c8b 100644 --- a/product/roundhouse.tests/roundhouse.tests.csproj +++ b/product/roundhouse.tests/roundhouse.tests.csproj @@ -181,7 +181,25 @@ Always - + + Always + + + Always + + + Always + + + Always + + + 
Always + + + Always + + Always diff --git a/product/roundhouse/consoles/DefaultConfiguration.cs b/product/roundhouse/consoles/DefaultConfiguration.cs index 0de0ba01..e0d9b2de 100644 --- a/product/roundhouse/consoles/DefaultConfiguration.cs +++ b/product/roundhouse/consoles/DefaultConfiguration.cs @@ -70,6 +70,6 @@ public sealed class DefaultConfiguration : ConfigurationPropertyHolder public Dictionary UserTokens { get; set; } public bool Initialize { get; set; } public string ConfigurationFile { get; set; } - public System.Text.Encoding FileEncoding { get; set; } + public System.Text.Encoding DefaultEncoding { get; set; } } } \ No newline at end of file diff --git a/product/roundhouse/infrastructure.app/ConfigurationPropertyHolder.cs b/product/roundhouse/infrastructure.app/ConfigurationPropertyHolder.cs index 3a813c75..1c6aef00 100644 --- a/product/roundhouse/infrastructure.app/ConfigurationPropertyHolder.cs +++ b/product/roundhouse/infrastructure.app/ConfigurationPropertyHolder.cs @@ -1,5 +1,6 @@ using System.Collections.Generic; using roundhouse.infrastructure.logging; +// ReSharper disable InconsistentNaming namespace roundhouse.infrastructure.app { @@ -67,7 +68,7 @@ public interface ConfigurationPropertyHolder bool SearchAllSubdirectoriesInsteadOfTraverse { get; set; } bool DisableOutput { get; set; } Dictionary UserTokens { get; set; } - System.Text.Encoding FileEncoding { get; set; } + System.Text.Encoding DefaultEncoding { get; set; } string ConfigurationFile { get; set; } } } \ No newline at end of file diff --git a/product/roundhouse/infrastructure/filesystem/DotNetFileSystemAccess.cs b/product/roundhouse/infrastructure/filesystem/DotNetFileSystemAccess.cs index eb104799..d4062c22 100644 --- a/product/roundhouse/infrastructure/filesystem/DotNetFileSystemAccess.cs +++ b/product/roundhouse/infrastructure/filesystem/DotNetFileSystemAccess.cs @@ -69,32 +69,36 @@ public string read_file_text(string file_path) } /// - /// Takes a guess at the file encoding by 
looking to see if it has a BOM + /// Try to use BOM to detect encoding. If BOM is not present, use default from configuration or UTF-8 if configuration is empty. /// /// Path to the file name /// A best guess at the encoding of the file - /// http://www.west-wind.com/WebLog/posts/197245.aspx + /// https://stackoverflow.com/a/4522251/381282 public Encoding get_file_encoding(string file_path) { - if(configuration.FileEncoding != null) - { - return configuration.FileEncoding; - } - // *** Use Default of Encoding.Default (Ansi CodePage) - Encoding enc = Encoding.Default; + Encoding enc = configuration.DefaultEncoding ?? Encoding.UTF8; - // *** Detect byte order mark if any - otherwise assume default + // *** Detect byte order mark if any byte[] buffer = new byte[5]; - FileStream file = new FileStream(file_path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite); - file.Read(buffer, 0, 5); - file.Close(); - + int bytes_read; + using (FileStream file = new FileStream(file_path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) + { + bytes_read = file.Read(buffer, 0, 5); + } + if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) enc = Encoding.UTF8; else if (buffer[0] == 0xfe && buffer[1] == 0xff) - enc = Encoding.Unicode; + enc = Encoding.BigEndianUnicode; // UTF-16 BE + if (buffer[0] == 0xff && buffer[1] == 0xfe) + { + if(bytes_read >= 4 && buffer[2] == 0 && buffer[3] == 0) + enc = Encoding.UTF32; // UTF-32 LE + else + enc = Encoding.Unicode; // UTF-16 LE + } else if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff) - enc = Encoding.UTF32; + enc = Encoding.GetEncoding(12001); // UTF-32 BE else if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76) enc = Encoding.UTF7;