Skip to content

Commit

Permalink
Changed "fileencoding" parameter to "defaultencoding" and its behavio…
Browse files Browse the repository at this point in the history
…r to detect BOM 1st before using encoding from configuration

- default encoding (if not provided in configuration) changed from Encoding.Default (ansi code page configured in OS) to UTF-8
- fixed incorrect BOM detection of LE\BE variants of UTF-16 and UTF-32
  • Loading branch information
Liwoj committed Oct 16, 2017
1 parent 249aabb commit 92126f2
Show file tree
Hide file tree
Showing 14 changed files with 144 additions and 41 deletions.
10 changes: 6 additions & 4 deletions product/roundhouse.console/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -346,15 +346,16 @@ private static void parse_arguments_and_set_up_configuration(ConfigurationProper
"SearchAllSubdirectoriesInsteadOfTraverse - Each Migration folder's subdirectories are traversed by default. This option pulls back scripts from the main directory and all subdirectories at once. Defaults to 'false'",
option => configuration.SearchAllSubdirectoriesInsteadOfTraverse = option != null)
.Add("isuptodate",
"This option prints whether there are any database updates or not, whithout actually running them. Other output except errors is disabled, to make it easy to use in scripts.",
"This option prints whether there are any database updates or not, without actually running them. Other output except errors is disabled, to make it easy to use in scripts.",
option => { })
.Add("fileencoding=",
"File Encoding - Explicitly specify the file encoding of all files",
// default encoding
.Add("defaultencoding=",
"Default encoding to use for loading script file from disk if file doesn't contain BOM. For the list of possible values see the column Name in table listed in .NET Encoding class documentation. Defaults to UTF-8",
option =>
{
if(option != null)
{
configuration.FileEncoding = System.Text.Encoding.GetEncoding(option);
configuration.DefaultEncoding = System.Text.Encoding.GetEncoding(option);
}
})
//load configuration from file
Expand Down Expand Up @@ -409,6 +410,7 @@ private static void parse_arguments_and_set_up_configuration(ConfigurationProper
"/dryrun " +
"/search[allsubdirectories]insteadoftraverse " +
"/isuptodate " +
"/defaultencoding VALUE" +
"]", Environment.NewLine);
show_help(usage_message, option_set);
}
Expand Down
2 changes: 1 addition & 1 deletion product/roundhouse.tasks/Roundhouse.cs
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ public string RecoveryMode

public bool SearchAllSubdirectoriesInsteadOfTraverse { get; set; }

public System.Text.Encoding FileEncoding { get; set; }
public System.Text.Encoding DefaultEncoding { get; set; }

public bool DisableOutput { get; set; }

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System;
using System.IO;
using NUnit.Framework;

Expand All @@ -11,45 +12,120 @@ public class DotNetFileSystemAccessSpecs
{
public abstract class concern_for_file_system : TinySpec<DotNetFileSystemAccess>
{
protected object result;
private DotNetFileSystemAccess dot_net_file_system_access = new DotNetFileSystemAccess(new DefaultConfiguration());
protected DotNetFileSystemAccess dot_net_file_system_access;

protected string utf_expected_string;

protected string very_small_file;
protected string utf8_file_with_bom;
protected string utf8_file_no_bom;
protected string utf16LE_file_with_bom;
protected string utf16BE_file_with_bom;
protected string utf32LE_file_with_bom;
protected string utf32BE_file_with_bom;
protected string ansi_file;

public override void Context() { }
protected override DotNetFileSystemAccess sut
{
get { return dot_net_file_system_access; }
set { dot_net_file_system_access = value; }
}
}

[Concern(typeof(DotNetFileSystemAccess))]
public class when_reading_files_with_different_formats : concern_for_file_system
{
protected static string utf8_file;
protected static string ansi_file;
/// <summary>
/// Reads one test fixture from the "infrastructure/filesystem" folder below the NUnit test directory
/// by passing its full path to <c>sut.read_file_text</c>.
/// </summary>
/// <param name="file_name">File name of the fixture, e.g. "utf8_with_bom.txt".</param>
/// <returns>Whatever <c>sut.read_file_text</c> returns for that fixture.</returns>
private string read_test_file(string file_name)
{
    // Path.Combine joins the segments itself, so no directory separator is hard-coded here.
    string fixture_path = Path.Combine(TestContext.CurrentContext.TestDirectory, "infrastructure", "filesystem", file_name);
    return sut.read_file_text(fixture_path);
}

public override void Because()
{
utf8_file = sut.read_file_text(Path.Combine(TestContext.CurrentContext.TestDirectory, @"infrastructure\filesystem\utf8encoded.txt"));
ansi_file = sut.read_file_text(Path.Combine(TestContext.CurrentContext.TestDirectory, @"infrastructure\filesystem\ansiencoded.txt"));
// NOTE: When testing encodings, it's important that the result doesn't depend on how the compiler itself reads this source file.
// The best way to ensure that is to work with byte representations, i.e. do not put any non-ASCII characters (> 127) directly in the source code.
string unicode_character = Encoding.UTF8.GetString(new byte[] { 0xc3, 0xa3 }); // small "a" with tilde (utf-8)
utf_expected_string = string.Format("INSERT INTO [dbo].[timmy]([value]) VALUES('G{0}')", unicode_character);

very_small_file = read_test_file("very_small_file.txt");
utf8_file_with_bom = read_test_file("utf8_with_bom.txt");
utf8_file_no_bom = read_test_file("utf8_no_bom.txt");
utf16LE_file_with_bom = read_test_file("utf16LE_with_bom.txt");
utf16BE_file_with_bom = read_test_file("utf16BE_with_bom.txt");
utf32LE_file_with_bom = read_test_file("utf32LE_with_bom.txt");
utf32BE_file_with_bom = read_test_file("utf32BE_with_bom.txt");

ansi_file = read_test_file("ansiencoded.txt");
}

// Asserts that the text stored in very_small_file is exactly "GO".
[Observation]
public void very_small_file_should_read_correctly()
{
very_small_file.should_be_equal_to("GO");
}

// Asserts that the text stored in utf8_file_with_bom equals utf_expected_string.
[Observation]
public void utf8_encoded_file_with_bom_should_read_correctly()
{
utf8_file_with_bom.should_be_equal_to(utf_expected_string);
}

// Asserts that the text stored in utf16LE_file_with_bom equals utf_expected_string.
[Observation]
public void utf16LE_encoded_file_with_bom_should_read_correctly()
{
utf16LE_file_with_bom.should_be_equal_to(utf_expected_string);
}

// Asserts that the text stored in utf16BE_file_with_bom equals utf_expected_string.
[Observation]
public void utf16BE_encoded_file_with_bom_should_read_correctly()
{
utf16BE_file_with_bom.should_be_equal_to(utf_expected_string);
}

[Observation]
public void utf8_encoded_file_should_read_correctly()
public void utf32LE_encoded_file_with_bom_should_read_correctly()
{
utf32LE_file_with_bom.should_be_equal_to(utf_expected_string);
}

// Asserts that the text stored in utf32BE_file_with_bom equals utf_expected_string.
[Observation]
public void utf32BE_encoded_file_with_bom_should_read_correctly()
{
utf32BE_file_with_bom.should_be_equal_to(utf_expected_string);
}
}


[Concern(typeof(DotNetFileSystemAccess))]
public class when_reading_files_with_different_formats_with_default_configuration : concern_for_file_system
{
public override void Context()
{
utf8_file.should_be_equal_to("INSERT INTO [dbo].[timmy]([value]) VALUES('Gã')");
dot_net_file_system_access = new DotNetFileSystemAccess(new DefaultConfiguration());
}

[Observation]
public void ansi_encoded_file_should_read_correctly()
public void utf8_encoded_file_without_bom_should_read_correctly()
{
ansi_file.should_be_equal_to(to_default_code_page("INSERT INTO [dbo].[timmy]([value]) VALUES('Gã')"));
utf8_file_no_bom.should_be_equal_to(utf_expected_string);
}
}

[Concern(typeof(DotNetFileSystemAccess))]
public class when_reading_files_with_different_formats_with_ansi_encoding_configuration : concern_for_file_system
{
private readonly Encoding ansi_encoding = Encoding.GetEncoding("windows-1252");

private string to_default_code_page(string english_code_page_text)
public override void Context()
{
var bytes = Encoding.GetEncoding(1252).GetBytes(english_code_page_text);
return Encoding.Default.GetString(bytes);
dot_net_file_system_access = new DotNetFileSystemAccess(
new DefaultConfiguration { DefaultEncoding = ansi_encoding }
);
}

// Asserts that ansi_file equals the expected INSERT statement whose accented
// character is decoded with ansi_encoding.
[Observation]
public void ansi_encoded_file_should_read_correctly()
{
    // small "a" with tilde in Windows-1252, built from its byte value so the
    // expectation does not depend on how the compiler decodes this source file
    string extended_ascii_character = ansi_encoding.GetString(new byte[] { 0xe3 });
    string expected_text = string.Format("INSERT INTO [dbo].[timmy]([value]) VALUES('G{0}')", extended_ascii_character);

    ansi_file.should_be_equal_to(expected_text);
}
}
}
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
INSERT INTO [dbo].[timmy]([value]) VALUES('Gã')
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
GO
20 changes: 19 additions & 1 deletion product/roundhouse.tests/roundhouse.tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,25 @@
<Content Include="infrastructure\filesystem\ansiencoded.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="infrastructure\filesystem\utf8encoded.txt">
<Content Include="infrastructure\filesystem\utf16BE_with_bom.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="infrastructure\filesystem\utf16LE_with_bom.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="infrastructure\filesystem\utf32BE_with_bom.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="infrastructure\filesystem\utf32LE_with_bom.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="infrastructure\filesystem\utf8_no_bom.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="infrastructure\filesystem\utf8_with_bom.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="infrastructure\filesystem\very_small_file.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>
Expand Down
2 changes: 1 addition & 1 deletion product/roundhouse/consoles/DefaultConfiguration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,6 @@ public sealed class DefaultConfiguration : ConfigurationPropertyHolder
public bool DisableOutput { get; set; }
public bool Initialize { get; set; }
public string ConfigurationFile { get; set; }
public System.Text.Encoding FileEncoding { get; set; }
public System.Text.Encoding DefaultEncoding { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using roundhouse.infrastructure.logging;
// ReSharper disable InconsistentNaming

namespace roundhouse.infrastructure.app
{
Expand Down Expand Up @@ -65,7 +66,7 @@ public interface ConfigurationPropertyHolder
bool DisableTokenReplacement { get; set; }
bool SearchAllSubdirectoriesInsteadOfTraverse { get; set; }
bool DisableOutput { get; set; }
System.Text.Encoding FileEncoding { get; set; }
System.Text.Encoding DefaultEncoding { get; set; }
string ConfigurationFile { get; set; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,32 +69,36 @@ public string read_file_text(string file_path)
}

/// <summary>
/// Takes a guess at the file encoding by looking to see if it has a BOM
/// Try to use BOM to detect encoding. If BOM is not present, use default from configuration or UTF-8 if configuration is empty.
/// </summary>
/// <param name="file_path">Path to the file name</param>
/// <returns>A best guess at the encoding of the file</returns>
/// <remarks>http://www.west-wind.com/WebLog/posts/197245.aspx</remarks>
/// <remarks>https://stackoverflow.com/a/4522251/381282</remarks>
public Encoding get_file_encoding(string file_path)
{
if(configuration.FileEncoding != null)
{
return configuration.FileEncoding;
}
// *** Use Default of Encoding.Default (Ansi CodePage)
Encoding enc = Encoding.Default;
Encoding enc = configuration.DefaultEncoding ?? Encoding.UTF8;

// *** Detect byte order mark if any - otherwise assume default
// *** Detect byte order mark if any
byte[] buffer = new byte[5];
FileStream file = new FileStream(file_path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
file.Read(buffer, 0, 5);
file.Close();

int bytes_read;
using (FileStream file = new FileStream(file_path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
bytes_read = file.Read(buffer, 0, 5);
}

if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
enc = Encoding.UTF8;
else if (buffer[0] == 0xfe && buffer[1] == 0xff)
enc = Encoding.Unicode;
enc = Encoding.BigEndianUnicode; // UTF-16 BE
if (buffer[0] == 0xff && buffer[1] == 0xfe)
{
if(bytes_read >= 4 && buffer[2] == 0 && buffer[3] == 0)
enc = Encoding.UTF32; // UTF-32 LE
else
enc = Encoding.Unicode; // UTF-16 LE
}
else if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
enc = Encoding.UTF32;
enc = Encoding.GetEncoding(12001); // UTF-32 BE
else if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
enc = Encoding.UTF7;

Expand Down

0 comments on commit 92126f2

Please sign in to comment.