Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix string encoding issues #40

Merged
merged 9 commits into from
Feb 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
Revision history for Perl module Neo4j::Bolt
0.4202 2021-02-24
- Fix rare string encoding issues (#38, #39)
0.4201 2021-01-20
- Common types for Neo4j objects between Bolt and Driver (Arne)
- Update (c)
Expand Down
3 changes: 1 addition & 2 deletions Makefile.PL
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ if ($neo_info) {
WriteMakefile(
NAME => 'Neo4j::Bolt',
VERSION_FROM => 'lib/Neo4j/Bolt.pm',
MIN_PERL_VERSION => '5.010',
MIN_PERL_VERSION => '5.010001',
CONFIGURE_REQUIRES => {
'Alien::OpenSSL' => 0,
'ExtUtils::MakeMaker' => '7.12',
Expand All @@ -67,7 +67,6 @@ WriteMakefile(
'Alien::OpenSSL' => 0,
'Neo4j::Client' => '0.46',
'Neo4j::Types' => '1.00',
'parent' => 0, # parent was first released with perl v5.10.1
'JSON::PP' => 0,
'URI' => 0,
'XSLoader' => '0.14', # XSLoader::load()
Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package Neo4j::Bolt;
use Cwd qw/realpath getcwd/;

BEGIN {
our $VERSION = "0.4201";
our $VERSION = "0.4202";
require Neo4j::Bolt::Cxn;
require Neo4j::Bolt::Txn;
require Neo4j::Bolt::ResultStream;
Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt/CResultStream.pm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package
Neo4j::Bolt::CResultStream;
BEGIN {
our $VERSION = "0.4201";
our $VERSION = "0.4202";
require XSLoader;
XSLoader::load();
}
Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt/CTypeHandlers.pm
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package Neo4j::Bolt::CTypeHandlers;
BEGIN {
our $VERSION = "0.4201";
our $VERSION = "0.4202";
require XSLoader;
XSLoader::load();
}
Expand Down
32 changes: 22 additions & 10 deletions lib/Neo4j/Bolt/CTypeHandlers.xs
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,15 @@ neo4j_value_t SVnv_to_neo4j_float (SV *sv) {
neo4j_value_t SVpv_to_neo4j_string (SV *sv) {
STRLEN len;
char *k0,*k;
SV *sv2;
k = SvPV(sv,len);
// create duplicate to keep SvPVutf8 from changing the original SV
sv2 = newSVpvn_flags(k, len, SvFLAGS(sv) & SVf_UTF8 | SVs_TEMP);
k = SvPVutf8(sv2, len);
Newx(k0,len+1,char);
strncpy(k0,k,(size_t) len);
memcpy(k0,k,(size_t) len);
*(k0+len) = 0;
return neo4j_string(k0);
return neo4j_ustring(k0, len);
}

neo4j_value_t SV_to_neo4j_value(SV *sv) {
Expand Down Expand Up @@ -158,8 +162,9 @@ neo4j_value_t AV_to_neo4j_list(AV *av) {
neo4j_value_t HV_to_neo4j_map (HV *hv) {
HE *ent;
char *k,*k0;
SV *v;
int n,retlen;
SV *v,*ksv;
int n;
STRLEN retlen;
neo4j_map_entry_t *map_ents;
if (!HvTOTALKEYS(hv)) {
return neo4j_null;
Expand All @@ -168,9 +173,10 @@ neo4j_value_t HV_to_neo4j_map (HV *hv) {
hv_iterinit(hv);
n=0;
while ((ent = hv_iternext(hv))) {
k = hv_iterkey(ent,&retlen);
ksv = hv_iterkeysv(ent);
k = SvPVutf8(ksv, retlen);
Newx(k0,retlen+1,char);
strncpy(k0,k,retlen);
memcpy(k0,k,retlen);
*(k0+retlen)=0;
map_ents[n] = neo4j_map_entry( k0, SV_to_neo4j_value(hv_iterval(hv,ent)));
n++;
Expand Down Expand Up @@ -298,9 +304,11 @@ SV* neo4j_int_to_SViv( neo4j_value_t value ) {
}

SV* neo4j_string_to_SVpv( neo4j_value_t value ) {
STRLEN len;
SV* pv;
pv = newSVpv(neo4j_string_to_alloc_str(value), 0);
SvUTF8_on(pv); // depends on libneo4j-client output being valid UTF-8, always
len = neo4j_string_length(value);
pv = newSVpvn(neo4j_string_to_alloc_str(value), len);
sv_utf8_decode(pv);
return pv;
}

Expand Down Expand Up @@ -354,6 +362,7 @@ AV* neo4j_list_to_AV( neo4j_value_t value ) {

HV* neo4j_map_to_HV( neo4j_value_t value ) {
int i,n;
I32 klen;
char *ks;
const neo4j_map_entry_t *entry;
HV *hv;
Expand All @@ -365,8 +374,11 @@ HV* neo4j_map_to_HV( neo4j_value_t value ) {
ks = neo4j_string_to_alloc_str(entry->key);
sv = neo4j_value_to_SV(entry->value);
SvREFCNT_inc(sv);
if (hv_store(hv, ks, neo4j_string_length(entry->key), sv,0) ==
NULL) {
klen = neo4j_string_length(entry->key);
if (! is_utf8_invariant_string((U8 *)ks, (STRLEN)klen)) {
klen = -klen;
}
if (hv_store(hv, ks, klen, sv, 0) == NULL) {
SvREFCNT_dec(sv);
fprintf(stderr, "Failed to create hash entry for key '%s'\n",ks);
}
Expand Down
4 changes: 3 additions & 1 deletion lib/Neo4j/Bolt/Cxn.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package Neo4j::Bolt::Cxn;
use Carp qw/croak/;

BEGIN {
our $VERSION = "0.4201";
our $VERSION = "0.4202";
require Neo4j::Bolt::CTypeHandlers;
require Neo4j::Bolt::ResultStream;
require XSLoader;
Expand All @@ -27,6 +27,7 @@ sub run_query {
croak "Arg 2 should be a hashref of { param => $value, ... }";
}
croak "No connection" unless $self->connected;
utf8::upgrade($query);
return $self->run_query_($query, $parms // {}, 0, $db // default_db);
}

Expand All @@ -40,6 +41,7 @@ sub send_query {
croak "Arg 2 should be a hashref of { param => $value, ... }";
}
croak "No connection" unless $self->connected;
utf8::upgrade($query);
return $self->run_query_($query, $parms ? $parms : {}, 1, $db // default_db );
}

Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt/NeoValue.pm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package Neo4j::Bolt::NeoValue;

BEGIN {
our $VERSION = "0.4201";
our $VERSION = "0.4202";
require Neo4j::Bolt::CTypeHandlers;
require Neo4j::Bolt::CResultStream;
require XSLoader;
Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt/Node.pm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package Neo4j::Bolt::Node;
# ABSTRACT: Representation of Neo4j Node

$Neo4j::Bolt::Node::VERSION = '0.4201';
$Neo4j::Bolt::Node::VERSION = '0.4202';

use strict;
use warnings;
Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt/Path.pm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package Neo4j::Bolt::Path;
# ABSTRACT: Representation of Neo4j Path

$Neo4j::Bolt::Path::VERSION = '0.4201';
$Neo4j::Bolt::Path::VERSION = '0.4202';

use strict;
use warnings;
Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt/Relationship.pm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package Neo4j::Bolt::Relationship;
# ABSTRACT: Representation of Neo4j Relationship

$Neo4j::Bolt::Relationship::VERSION = '0.4201';
$Neo4j::Bolt::Relationship::VERSION = '0.4202';

use strict;
use warnings;
Expand Down
2 changes: 1 addition & 1 deletion lib/Neo4j/Bolt/ResultStream.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package Neo4j::Bolt::ResultStream;
# use Neo4j::Client;

BEGIN {
our $VERSION = "0.4201";
our $VERSION = "0.4202";
require Neo4j::Bolt::Cxn;
require Neo4j::Bolt::CResultStream;
require XSLoader;
Expand Down
4 changes: 3 additions & 1 deletion lib/Neo4j/Bolt/Txn.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package Neo4j::Bolt::Txn;
use Carp qw/croak/;

BEGIN {
our $VERSION = "0.4201";
our $VERSION = "0.4202";
require Neo4j::Bolt::CTypeHandlers;
require Neo4j::Bolt::ResultStream;
require XSLoader;
Expand Down Expand Up @@ -44,6 +44,7 @@ sub run_query {
if ($parms && !(ref $parms == 'HASH')) {
die "Arg 2 should be a hashref of { param => $value, ... }";
}
utf8::upgrade($query);
return $self->run_query_($query, $parms ? $parms : {}, 0);
}

Expand All @@ -56,6 +57,7 @@ sub send_query {
if ($parms && !(ref $parms == 'HASH')) {
die "Arg 2 should be a hashref of { param => $value, ... }";
}
utf8::upgrade($query);
return $self->run_query_($query, $parms // {}, 1);
}

Expand Down
160 changes: 160 additions & 0 deletions t/008_strings.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
use strict;
use warnings;
use Test::More;
use Encode ();
use File::Spec;
use URI;
use Neo4j::Bolt;
use Neo4j::Bolt::NeoValue;

# String encoding, esp. to and from UTF-8

plan tests => 7 + 14 + 18;

my ($i, $o, $v);

sub SVf_UTF8 { utf8::is_utf8(shift) ? 'U' : 'B' }
sub to_hex { join ' ', SVf_UTF8($_[0]), map { sprintf "%02x", ord $_ } split //, $_[0] }
sub to_str { my $s = shift; $s =~ s/([[:xdigit:]]{2})/chr(hex($1))/eg; Encode::_utf8_on($s); $s }

# GitHub issue #39
$i = "A\0B";
$v = Neo4j::Bolt::NeoValue->_new_from_perl($i);
is to_hex($v->_as_perl), to_hex($i), "NUL byte in string";

# GitHub issue #38

# strings with Perl byte semantics shouldn't be treated as Unicode strings
no utf8;
$i = "\x{c4}\x{80}"; # a sequence of two bytes that also happens to be valid UTF-8 for U+0100
eval { Encode::_utf8_off($i) }; # SVf_UTF8 should already be off, but why not try to make sure
$o = $i;
$v = Neo4j::Bolt::NeoValue->_new_from_perl($i);
$v = $v->_as_perl;
isnt to_hex($v), "U 100", "bytes in string";
ok utf8::is_utf8($v), "bytes in string - Neo4j returns UTF-8";

# same for map keys
# (which are strings as well, but use a different code path in CTypeHandlers)
$v = Neo4j::Bolt::NeoValue->_new_from_perl( {$i => 1} );
$v = ( keys %{$v->_as_perl} )[0];
isnt to_hex($v), "U 100", "bytes in map key";
ok utf8::is_utf8($v), "bytes in map key - Neo4j returns UTF-8";
is to_hex($i), to_hex($o), "input SV unchanged";

# Note: The exact result in the tests above depends on the native single
# byte encoding that Perl assumes for this file under "no utf8;". This
# encoding is usually Latin-1 (which would yield "U c4 80"), but could
# be anything else. However, we know "U 100" means that the non-Unicode
# bytes were interpreted as Unicode UTF-8, which is definitely wrong.

# real Unicode char in map key
use utf8;
$i = "\x{100}";
eval { Encode::_utf8_on($i) }; # SVf_UTF8 should already be on, but why not try to make sure
$v = Neo4j::Bolt::NeoValue->_new_from_perl( {$i => 1} );
$v = ( keys %{$v->_as_perl} )[0];
is to_hex($v), to_hex($i), "Unicode char in map key";

# byte sequences that aren't valid UTF-8 shouldn't be treated as such (RFC3629)
my @seq = (
40 => "B 40", # valid UTF-8 for U+0040
C2BD => "U bd", # valid UTF-8 for U+00BD
C1 => "B c1", # invalid byte
F5 => "B f5", # invalid byte
FF => "B ff", # invalid byte
A0 => "B a0", # unexpected continuation byte
5885 => "B 58 85", # unexpected continuation byte
D0D1 => "B d0 d1", # non-continuation byte before end of character
C838 => "B c8 38", # non-continuation byte before end of character
EEBB => "B ee bb", # ending before end of character
EDBFBF => "U dfff", # invalid UTF-8 code point, but allowed in Perl
F4908080 => "U 110000", # invalid UTF-8 code point, but allowed in Perl
F08282AC => "B f0 82 82 ac", # overlong encoding (for U+20AC)
C080 => "B c0 80", # overlong encoding (for U+0000)
);
for (my $k = 0; $k < @seq; ) {
$i = $seq[$k++];
$o = $seq[$k++];
$v = Neo4j::Bolt::NeoValue->_new_from_perl( to_str($i) );
is to_hex($v->_as_perl), $o, "byte sequence $i";
}

# Cypher statements
# (which are strings as well, but use a different code path in Cxn/Txn)

my $neo_info;
my $nif = File::Spec->catfile('t','neo_info');
if (-e $nif ) {
local $/;
open my $fh, "<", $nif or die $!;
my $val = <$fh>;
$val =~ s/^.*?(=.*)$/\$neo_info $1/s;
eval $val;
}

my $cxn;
if (defined $neo_info && $neo_info->{tests}) {
my $url = URI->new("bolt://$neo_info->{host}");
$url->userinfo("$neo_info->{user}:$neo_info->{pass}") if $neo_info->{user};
$cxn = Neo4j::Bolt->connect("$url");
}

SKIP: {
skip "statement tests require server connection", 18 unless $cxn && $cxn->connected;
my ($q, $id);

use utf8;
$i = "\x{100}";
$q = "RETURN '$i'";
eval { Encode::_utf8_on($q) }; # SVf_UTF8 should already be on...
$v = ($cxn->run_query($q)->fetch_next)[0];
is to_hex($v), to_hex($i), "Unicode char in Cxn run_query";
(undef, $v) = $cxn->do_query($q);
is to_hex($v->[0]), to_hex($i), "Unicode char in Cxn do_query";

no utf8;
$i = "\x{c4}\x{80}";
$q = "RETURN '$i'";
eval { Encode::_utf8_off($q) }; # SVf_UTF8 should already be off...
$v = ($cxn->run_query($q)->fetch_next)[0];
isnt to_hex($v), "U 100", "bytes in Cxn run_query - input not treated as UTF-8";
ok utf8::is_utf8($v), "bytes in Cxn run_query - output encoded in UTF-8";
(undef, $v) = $cxn->do_query($q);
isnt to_hex($v->[0]), "U 100", "bytes in Cxn do_query - input not treated as UTF-8";
ok utf8::is_utf8($v->[0]), "bytes in Cxn do_query - output encoded in UTF-8";

skip "transaction tests require Bolt version 3+", 12 if $cxn->protocol_version lt "3.0";
ok my $txn = Neo4j::Bolt::Txn->new($cxn), "begin transaction";

use utf8;
$i = "\x{100}";
eval { Encode::_utf8_on($i) }; # SVf_UTF8 should already be on...
(undef, $v) = $txn->do_query("CREATE (n) RETURN '$i', id(n)");
is to_hex($v->[0]), to_hex($i), "Unicode char in Txn do_query";
$id = {id => $v->[1]};
$v = $txn->send_query("MATCH (n) WHERE id(n) = \$id SET n.t = '$i'", $id);
ok $v->success(), "Txn send_query 1";
($v, $o) = $txn->run_query("MATCH (n) WHERE id(n) = \$id RETURN n.t, '$i'", $id)->fetch_next;
is to_hex($v), to_hex($i), "Unicode char in Txn send_query";
is to_hex($o), to_hex($i), "Unicode char in Txn run_query";

no utf8;
$i = "\x{c4}\x{80}";
eval { Encode::_utf8_off($i) }; # SVf_UTF8 should already be off...
(undef, $v) = $txn->do_query("CREATE (n) RETURN '$i', id(n)");
isnt to_hex($v->[0]), "U 100", "bytes in Txn do_query - input not treated as UTF-8";
ok utf8::is_utf8($v->[0]), "bytes in Txn do_query - output encoded in UTF-8";
$id = {id => $v->[1]};
$v = $txn->send_query("MATCH (n) WHERE id(n) = \$id SET n.t = '$i'", $id);
ok $v->success(), "Txn send_query 2";
($v, $o) = $txn->run_query("MATCH (n) WHERE id(n) = \$id RETURN n.t, '$i'", $id)->fetch_next;
isnt to_hex($v), "U 100", "bytes in Txn send_query - input not treated as UTF-8";
ok utf8::is_utf8($v), "bytes in Txn send_query - output encoded in UTF-8";
isnt to_hex($o), "U 100", "bytes in Txn run_query - input not treated as UTF-8";
ok utf8::is_utf8($o), "bytes in Txn run_query - output encoded in UTF-8";

$txn->rollback;
}

done_testing;