diff --git a/Changes b/Changes index 9ec22e3..ab138b2 100644 --- a/Changes +++ b/Changes @@ -1,4 +1,6 @@ Revision history for Perl module Neo4j::Bolt +0.4202 2021-02-24 + - Fix rare string encoding issues (#38, #39) 0.4201 2021-01-20 - Common types for Neo4j objects between Bolt and Driver (Arne) - Update (c) diff --git a/Makefile.PL b/Makefile.PL index c80a305..3b5e0e3 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -53,7 +53,7 @@ if ($neo_info) { WriteMakefile( NAME => 'Neo4j::Bolt', VERSION_FROM => 'lib/Neo4j/Bolt.pm', - MIN_PERL_VERSION => '5.010', + MIN_PERL_VERSION => '5.010001', CONFIGURE_REQUIRES => { 'Alien::OpenSSL' => 0, 'ExtUtils::MakeMaker' => '7.12', @@ -67,7 +67,6 @@ WriteMakefile( 'Alien::OpenSSL' => 0, 'Neo4j::Client' => '0.46', 'Neo4j::Types' => '1.00', - 'parent' => 0, # parent was first released with perl v5.10.1 'JSON::PP' => 0, 'URI' => 0, 'XSLoader' => '0.14', # XSLoader::load() diff --git a/lib/Neo4j/Bolt.pm b/lib/Neo4j/Bolt.pm index d77e6c7..b8bb4e3 100644 --- a/lib/Neo4j/Bolt.pm +++ b/lib/Neo4j/Bolt.pm @@ -2,7 +2,7 @@ package Neo4j::Bolt; use Cwd qw/realpath getcwd/; BEGIN { - our $VERSION = "0.4201"; + our $VERSION = "0.4202"; require Neo4j::Bolt::Cxn; require Neo4j::Bolt::Txn; require Neo4j::Bolt::ResultStream; diff --git a/lib/Neo4j/Bolt/CResultStream.pm b/lib/Neo4j/Bolt/CResultStream.pm index 08b2a97..31f8582 100644 --- a/lib/Neo4j/Bolt/CResultStream.pm +++ b/lib/Neo4j/Bolt/CResultStream.pm @@ -1,7 +1,7 @@ package Neo4j::Bolt::CResultStream; BEGIN { - our $VERSION = "0.4201"; + our $VERSION = "0.4202"; require XSLoader; XSLoader::load(); } diff --git a/lib/Neo4j/Bolt/CTypeHandlers.pm b/lib/Neo4j/Bolt/CTypeHandlers.pm index 2b95a83..ca9b8bb 100644 --- a/lib/Neo4j/Bolt/CTypeHandlers.pm +++ b/lib/Neo4j/Bolt/CTypeHandlers.pm @@ -1,6 +1,6 @@ package Neo4j::Bolt::CTypeHandlers; BEGIN { - our $VERSION = "0.4201"; + our $VERSION = "0.4202"; require XSLoader; XSLoader::load(); } diff --git a/lib/Neo4j/Bolt/CTypeHandlers.xs b/lib/Neo4j/Bolt/CTypeHandlers.xs index cc2f003..d102598 100644 --- a/lib/Neo4j/Bolt/CTypeHandlers.xs +++ b/lib/Neo4j/Bolt/CTypeHandlers.xs @@ -72,11 +72,15 @@ neo4j_value_t SVnv_to_neo4j_float (SV *sv) { neo4j_value_t SVpv_to_neo4j_string (SV *sv) { STRLEN len; char *k0,*k; + SV *sv2; k = SvPV(sv,len); + // create duplicate to keep SvPVutf8 from changing the original SV + sv2 = newSVpvn_flags(k, len, SvFLAGS(sv) & SVf_UTF8 | SVs_TEMP); + k = SvPVutf8(sv2, len); Newx(k0,len+1,char); - strncpy(k0,k,(size_t) len); + memcpy(k0,k,(size_t) len); *(k0+len) = 0; - return neo4j_string(k0); + return neo4j_ustring(k0, len); } neo4j_value_t SV_to_neo4j_value(SV *sv) { @@ -158,8 +162,9 @@ neo4j_value_t AV_to_neo4j_list(AV *av) { neo4j_value_t HV_to_neo4j_map (HV *hv) { HE *ent; char *k,*k0; - SV *v; - int n,retlen; + SV *v,*ksv; + int n; + STRLEN retlen; neo4j_map_entry_t *map_ents; if (!HvTOTALKEYS(hv)) { return neo4j_null; @@ -168,9 +173,10 @@ neo4j_value_t HV_to_neo4j_map (HV *hv) { hv_iterinit(hv); n=0; while ((ent = hv_iternext(hv))) { - k = hv_iterkey(ent,&retlen); + ksv = hv_iterkeysv(ent); + k = SvPVutf8(ksv, retlen); Newx(k0,retlen+1,char); - strncpy(k0,k,retlen); + memcpy(k0,k,retlen); *(k0+retlen)=0; map_ents[n] = neo4j_map_entry( k0, SV_to_neo4j_value(hv_iterval(hv,ent))); n++; @@ -298,9 +304,11 @@ SV* neo4j_int_to_SViv( neo4j_value_t value ) { } SV* neo4j_string_to_SVpv( neo4j_value_t value ) { + STRLEN len; SV* pv; - pv = newSVpv(neo4j_string_to_alloc_str(value), 0); - SvUTF8_on(pv); // depends on libneo4j-client output being valid UTF-8, always + len = neo4j_string_length(value); + pv = newSVpvn(neo4j_string_to_alloc_str(value), len); + sv_utf8_decode(pv); return pv; } @@ -354,6 +362,7 @@ AV* neo4j_list_to_AV( neo4j_value_t value ) { HV* neo4j_map_to_HV( neo4j_value_t value ) { int i,n; + I32 klen; char *ks; const neo4j_map_entry_t *entry; HV *hv; @@ -365,8 +374,11 @@ HV* neo4j_map_to_HV( neo4j_value_t value ) { ks = neo4j_string_to_alloc_str(entry->key); sv = neo4j_value_to_SV(entry->value); SvREFCNT_inc(sv); - if (hv_store(hv, ks, neo4j_string_length(entry->key), sv,0) == - NULL) { + klen = neo4j_string_length(entry->key); + if (! is_utf8_invariant_string((U8 *)ks, (STRLEN)klen)) { + klen = -klen; + } + if (hv_store(hv, ks, klen, sv, 0) == NULL) { SvREFCNT_dec(sv); fprintf(stderr, "Failed to create hash entry for key '%s'\n",ks); } diff --git a/lib/Neo4j/Bolt/Cxn.pm b/lib/Neo4j/Bolt/Cxn.pm index f4ea05b..b2977f2 100644 --- a/lib/Neo4j/Bolt/Cxn.pm +++ b/lib/Neo4j/Bolt/Cxn.pm @@ -2,7 +2,7 @@ package Neo4j::Bolt::Cxn; use Carp qw/croak/; BEGIN { - our $VERSION = "0.4201"; + our $VERSION = "0.4202"; require Neo4j::Bolt::CTypeHandlers; require Neo4j::Bolt::ResultStream; require XSLoader; @@ -27,6 +27,7 @@ sub run_query { croak "Arg 2 should be a hashref of { param => $value, ... }"; } croak "No connection" unless $self->connected; + utf8::upgrade($query); return $self->run_query_($query, $parms // {}, 0, $db // default_db); } @@ -40,6 +41,7 @@ sub send_query { croak "Arg 2 should be a hashref of { param => $value, ... }"; } croak "No connection" unless $self->connected; + utf8::upgrade($query); return $self->run_query_($query, $parms ? $parms : {}, 1, $db // default_db ); } diff --git a/lib/Neo4j/Bolt/NeoValue.pm b/lib/Neo4j/Bolt/NeoValue.pm index 4b30468..b218cba 100644 --- a/lib/Neo4j/Bolt/NeoValue.pm +++ b/lib/Neo4j/Bolt/NeoValue.pm @@ -1,7 +1,7 @@ package Neo4j::Bolt::NeoValue; BEGIN { - our $VERSION = "0.4201"; + our $VERSION = "0.4202"; require Neo4j::Bolt::CTypeHandlers; require Neo4j::Bolt::CResultStream; require XSLoader; diff --git a/lib/Neo4j/Bolt/Node.pm b/lib/Neo4j/Bolt/Node.pm index 4b84e54..a6a2881 100644 --- a/lib/Neo4j/Bolt/Node.pm +++ b/lib/Neo4j/Bolt/Node.pm @@ -1,7 +1,7 @@ package Neo4j::Bolt::Node; # ABSTRACT: Representation of Neo4j Node -$Neo4j::Bolt::Node::VERSION = '0.4201'; +$Neo4j::Bolt::Node::VERSION = '0.4202'; use strict; use warnings; diff --git a/lib/Neo4j/Bolt/Path.pm b/lib/Neo4j/Bolt/Path.pm index 2abfa53..9263f90 100644 --- a/lib/Neo4j/Bolt/Path.pm +++ b/lib/Neo4j/Bolt/Path.pm @@ -1,7 +1,7 @@ package Neo4j::Bolt::Path; # ABSTRACT: Representation of Neo4j Path -$Neo4j::Bolt::Path::VERSION = '0.4201'; +$Neo4j::Bolt::Path::VERSION = '0.4202'; use strict; use warnings; diff --git a/lib/Neo4j/Bolt/Relationship.pm b/lib/Neo4j/Bolt/Relationship.pm index ee5b533..e883129 100644 --- a/lib/Neo4j/Bolt/Relationship.pm +++ b/lib/Neo4j/Bolt/Relationship.pm @@ -1,7 +1,7 @@ package Neo4j::Bolt::Relationship; # ABSTRACT: Representation of Neo4j Relationship -$Neo4j::Bolt::Relationship::VERSION = '0.4201'; +$Neo4j::Bolt::Relationship::VERSION = '0.4202'; use strict; use warnings; diff --git a/lib/Neo4j/Bolt/ResultStream.pm b/lib/Neo4j/Bolt/ResultStream.pm index f35edf3..783e0ed 100644 --- a/lib/Neo4j/Bolt/ResultStream.pm +++ b/lib/Neo4j/Bolt/ResultStream.pm @@ -2,7 +2,7 @@ package Neo4j::Bolt::ResultStream; # use Neo4j::Client; BEGIN { - our $VERSION = "0.4201"; + our $VERSION = "0.4202"; require Neo4j::Bolt::Cxn; require Neo4j::Bolt::CResultStream; require XSLoader; diff --git a/lib/Neo4j/Bolt/Txn.pm b/lib/Neo4j/Bolt/Txn.pm index b03c7ec..0eeb24e 100644 --- a/lib/Neo4j/Bolt/Txn.pm +++ b/lib/Neo4j/Bolt/Txn.pm @@ -2,7 +2,7 @@ package Neo4j::Bolt::Txn; use Carp qw/croak/; BEGIN { - our $VERSION = "0.4201"; + our $VERSION = "0.4202"; require Neo4j::Bolt::CTypeHandlers; require Neo4j::Bolt::ResultStream; require XSLoader; @@ -44,6 +44,7 @@ sub run_query { if ($parms && !(ref $parms == 'HASH')) { die "Arg 2 should be a hashref of { param => $value, ... }"; } + utf8::upgrade($query); return $self->run_query_($query, $parms ? $parms : {}, 0); } @@ -56,6 +57,7 @@ sub send_query { if ($parms && !(ref $parms == 'HASH')) { die "Arg 2 should be a hashref of { param => $value, ... }"; } + utf8::upgrade($query); return $self->run_query_($query, $parms // {}, 1); } diff --git a/t/008_strings.t b/t/008_strings.t new file mode 100644 index 0000000..28e2f53 --- /dev/null +++ b/t/008_strings.t @@ -0,0 +1,160 @@ +use strict; +use warnings; +use Test::More; +use Encode (); +use File::Spec; +use URI; +use Neo4j::Bolt; +use Neo4j::Bolt::NeoValue; + +# String encoding, esp. to and from UTF-8 + +plan tests => 7 + 14 + 18; + +my ($i, $o, $v); + +sub SVf_UTF8 { utf8::is_utf8(shift) ? 'U' : 'B' } +sub to_hex { join ' ', SVf_UTF8($_[0]), map { sprintf "%02x", ord $_ } split //, $_[0] } +sub to_str { my $s = shift; $s =~ s/([[:xdigit:]]{2})/chr(hex($1))/eg; Encode::_utf8_on($s); $s } + +# GitHub issue #39 +$i = "A\0B"; +$v = Neo4j::Bolt::NeoValue->_new_from_perl($i); +is to_hex($v->_as_perl), to_hex($i), "NUL byte in string"; + +# GitHub issue #38 + +# strings with Perl byte semantics shouldn't be treated as Unicode strings +no utf8; +$i = "\x{c4}\x{80}"; # a sequence of two bytes that also happens to be valid UTF-8 for U+0100 +eval { Encode::_utf8_off($i) }; # SVf_UTF8 should already be off, but why not try to make sure +$o = $i; +$v = Neo4j::Bolt::NeoValue->_new_from_perl($i); +$v = $v->_as_perl; +isnt to_hex($v), "U 100", "bytes in string"; +ok utf8::is_utf8($v), "bytes in string - Neo4j returns UTF-8"; + +# same for map keys +# (which are strings as well, but use a different code path in CTypeHandlers) +$v = Neo4j::Bolt::NeoValue->_new_from_perl( {$i => 1} ); +$v = ( keys %{$v->_as_perl} )[0]; +isnt to_hex($v), "U 100", "bytes in map key"; +ok utf8::is_utf8($v), "bytes in map key - Neo4j returns UTF-8"; +is to_hex($i), to_hex($o), "input SV unchanged"; + +# Note: The exact result in the tests above depends on the native single +# byte encoding that Perl assumes for this file under "no utf8;". This +# encoding is usually Latin-1 (which would yield "U c4 80"), but could +# be anything else. However, we know "U 100" means that the non-Unicode +# bytes were interpreted as Unicode UTF-8, which is definitely wrong. + +# real Unicode char in map key +use utf8; +$i = "\x{100}"; +eval { Encode::_utf8_on($i) }; # SVf_UTF8 should already be on, but why not try to make sure +$v = Neo4j::Bolt::NeoValue->_new_from_perl( {$i => 1} ); +$v = ( keys %{$v->_as_perl} )[0]; +is to_hex($v), to_hex($i), "Unicode char in map key"; + +# byte sequences that aren't valid UTF-8 shouldn't be treated as such (RFC3629) +my @seq = ( + 40 => "B 40", # valid UTF-8 for U+0040 + C2BD => "U bd", # valid UTF-8 for U+00BD + C1 => "B c1", # invalid byte + F5 => "B f5", # invalid byte + FF => "B ff", # invalid byte + A0 => "B a0", # unexpected continuation byte + 5885 => "B 58 85", # unexpected continuation byte + D0D1 => "B d0 d1", # non-continuation byte before end of character + C838 => "B c8 38", # non-continuation byte before end of character + EEBB => "B ee bb", # ending before end of character + EDBFBF => "U dfff", # invalid UTF-8 code point, but allowed in Perl + F4908080 => "U 110000", # invalid UTF-8 code point, but allowed in Perl + F08282AC => "B f0 82 82 ac", # overlong encoding (for U+20AC) + C080 => "B c0 80", # overlong encoding (for U+0000) +); +for (my $k = 0; $k < @seq; ) { + $i = $seq[$k++]; + $o = $seq[$k++]; + $v = Neo4j::Bolt::NeoValue->_new_from_perl( to_str($i) ); + is to_hex($v->_as_perl), $o, "byte sequence $i"; +} + +# Cypher statements +# (which are strings as well, but use a different code path in Cxn/Txn) + +my $neo_info; +my $nif = File::Spec->catfile('t','neo_info'); +if (-e $nif ) { + local $/; + open my $fh, "<", $nif or die $!; + my $val = <$fh>; + $val =~ s/^.*?(=.*)$/\$neo_info $1/s; + eval $val; +} + +my $cxn; +if (defined $neo_info && $neo_info->{tests}) { + my $url = URI->new("bolt://$neo_info->{host}"); + $url->userinfo("$neo_info->{user}:$neo_info->{pass}") if $neo_info->{user}; + $cxn = Neo4j::Bolt->connect("$url"); +} + +SKIP: { + skip "statement tests require server connection", 18 unless $cxn && $cxn->connected; + my ($q, $id); + + use utf8; + $i = "\x{100}"; + $q = "RETURN '$i'"; + eval { Encode::_utf8_on($q) }; # SVf_UTF8 should already be on... + $v = ($cxn->run_query($q)->fetch_next)[0]; + is to_hex($v), to_hex($i), "Unicode char in Cxn run_query"; + (undef, $v) = $cxn->do_query($q); + is to_hex($v->[0]), to_hex($i), "Unicode char in Cxn do_query"; + + no utf8; + $i = "\x{c4}\x{80}"; + $q = "RETURN '$i'"; + eval { Encode::_utf8_off($q) }; # SVf_UTF8 should already be off... + $v = ($cxn->run_query($q)->fetch_next)[0]; + isnt to_hex($v), "U 100", "bytes in Cxn run_query - input not treated as UTF-8"; + ok utf8::is_utf8($v), "bytes in Cxn run_query - output encoded in UTF-8"; + (undef, $v) = $cxn->do_query($q); + isnt to_hex($v->[0]), "U 100", "bytes in Cxn do_query - input not treated as UTF-8"; + ok utf8::is_utf8($v->[0]), "bytes in Cxn do_query - output encoded in UTF-8"; + + skip "transaction tests require Bolt version 3+", 12 if $cxn->protocol_version lt "3.0"; + ok my $txn = Neo4j::Bolt::Txn->new($cxn), "begin transaction"; + + use utf8; + $i = "\x{100}"; + eval { Encode::_utf8_on($i) }; # SVf_UTF8 should already be on... + (undef, $v) = $txn->do_query("CREATE (n) RETURN '$i', id(n)"); + is to_hex($v->[0]), to_hex($i), "Unicode char in Txn do_query"; + $id = {id => $v->[1]}; + $v = $txn->send_query("MATCH (n) WHERE id(n) = \$id SET n.t = '$i'", $id); + ok $v->success(), "Txn send_query 1"; + ($v, $o) = $txn->run_query("MATCH (n) WHERE id(n) = \$id RETURN n.t, '$i'", $id)->fetch_next; + is to_hex($v), to_hex($i), "Unicode char in Txn send_query"; + is to_hex($o), to_hex($i), "Unicode char in Txn run_query"; + + no utf8; + $i = "\x{c4}\x{80}"; + eval { Encode::_utf8_off($i) }; # SVf_UTF8 should already be off... + (undef, $v) = $txn->do_query("CREATE (n) RETURN '$i', id(n)"); + isnt to_hex($v->[0]), "U 100", "bytes in Txn do_query - input not treated as UTF-8"; + ok utf8::is_utf8($v->[0]), "bytes in Txn do_query - output encoded in UTF-8"; + $id = {id => $v->[1]}; + $v = $txn->send_query("MATCH (n) WHERE id(n) = \$id SET n.t = '$i'", $id); + ok $v->success(), "Txn send_query 2"; + ($v, $o) = $txn->run_query("MATCH (n) WHERE id(n) = \$id RETURN n.t, '$i'", $id)->fetch_next; + isnt to_hex($v), "U 100", "bytes in Txn send_query - input not treated as UTF-8"; + ok utf8::is_utf8($v), "bytes in Txn send_query - output encoded in UTF-8"; + isnt to_hex($o), "U 100", "bytes in Txn run_query - input not treated as UTF-8"; + ok utf8::is_utf8($o), "bytes in Txn run_query - output encoded in UTF-8"; + + $txn->rollback; +} + +done_testing;