Skip to content

Commit

Permalink
UOE-11453: handling whitespaces in fastxml (#965)
Browse files Browse the repository at this point in the history
  • Loading branch information
pm-viral-vala authored Nov 26, 2024
1 parent ee1ada2 commit 0cef376
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 107 deletions.
File renamed without changes.
Empty file.
3 changes: 2 additions & 1 deletion endpoints/events/test/raw_vast.txt

Large diffs are not rendered by default.

32 changes: 19 additions & 13 deletions endpoints/events/vtrack_ow.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"encoding/base64"
"errors"
"regexp"
"strings"
"time"

Expand All @@ -16,6 +17,7 @@ import (

var (
// errEventURLNotConfigured is a package-level error value carrying the
// message "event urls not configured".
errEventURLNotConfigured = errors.New("event urls not configured")
// tmpWSRemoverRegex matches a '>' followed by one or more whitespace
// characters and then a '<'. tmpFastXMLProcessing uses it to rewrite such
// runs as "><". The "tmp" prefix suggests it is meant to be short-lived
// (NOTE(review): confirm when the fastxml comparison experiment ends).
tmpWSRemoverRegex = regexp.MustCompile(`>\s+<`)
)

// InjectVideoEventTrackers injects the video tracking events
Expand Down Expand Up @@ -52,16 +54,18 @@ func InjectVideoEventTrackers(
etreeParserTime := time.Since(_startTime)

if fastXMLExperiment && err == nil {
etreeXMLResponse := response

_startTime = time.Now()
fastXMLResponse, _ := injectVideoEventsFastXML(vastXML, eventURLMap, nurlPresent, imp.Video.Linearity)
fastXMLParserTime := time.Since(_startTime)

//temporary
if fastXMLResponse != vastXML {
fastXMLResponse = tmpFastXMLProcessing(fastXMLResponse)
fastXMLResponse, etreeXMLResponse = tmpFastXMLProcessing(fastXMLResponse, response)
}

isResponseMismatch := (response != fastXMLResponse)
isResponseMismatch := (etreeXMLResponse != fastXMLResponse)

if isResponseMismatch {
openrtb_ext.FastXMLLogf("\n[XML_PARSER_TEST] method:[vcr] creative:[%s]", base64.StdEncoding.EncodeToString([]byte(vastXML)))
Expand All @@ -77,17 +81,6 @@ func InjectVideoEventTrackers(
return response, metrics, err
}

// tmpFastXMLProcessing is a temporary clean-up for fastxml output: every
// occurrence of a space directly followed by '>' is rewritten as a bare '>'.
// It is intended to be called only when trackers were actually injected,
// i.e. when the fastxml output differs from the input VAST.
//
// A single-quote to double-quote conversion of the part of the document
// starting at "<VAST" was previously sketched here and is currently disabled.
func tmpFastXMLProcessing(vast string) string {
	return strings.ReplaceAll(vast, " >", ">")
}

func injectVideoEventsETree(vastXML string, eventURLMap map[string]string, nurlPresent bool, linearity adcom1.LinearityMode) (string, error) {

// parse VAST
Expand Down Expand Up @@ -236,3 +229,16 @@ func FindCreatives(doc *etree.Document) []*etree.Element {
creatives = append(creatives, doc.FindElements("VAST/Ad/Wrapper/Creatives/Creative/NonLinearAds")...)
return creatives
}

// tmpFastXMLProcessing is a temporary normalisation applied to the fastxml and
// etree outputs so that the two can be compared as plain strings. It is
// intended to be called only when trackers were injected. The normalised
// fastxml document is returned first, the normalised etree document second.
func tmpFastXMLProcessing(fastXML, etreeXML string) (string, string) {
	// normalise runs the clean-ups shared by both documents, in this order:
	//  1. replace every '>' + whitespace run + '<' with "><"
	//  2. replace every " ><" with "><" (a space left before a tag's closing
	//     '>'); this runs after step 1, so it also sees the pairs step 1 has
	//     just joined
	//  3. rewrite every single quote as a double quote
	normalise := func(doc string) string {
		doc = tmpWSRemoverRegex.ReplaceAllString(doc, "><")
		doc = strings.ReplaceAll(doc, " ><", "><")
		return strings.ReplaceAll(doc, "'", "\"")
	}

	// Only the fastxml output has its leading and trailing whitespace trimmed.
	return normalise(strings.TrimSpace(fastXML)), normalise(etreeXML)
}
192 changes: 110 additions & 82 deletions endpoints/events/vtrack_ow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,36 @@ import (
"strings"
"testing"

"github.com/beevik/etree"
"github.com/prebid/openrtb/v20/adcom1"
"github.com/prebid/openrtb/v20/openrtb2"
"github.com/prebid/prebid-server/v2/openrtb_ext"
"github.com/stretchr/testify/assert"
)

// search reports whether value is present in arr. It relies on a binary
// search, so arr must already be sorted in ascending order.
func search(arr []int, value int) bool {
	pos := sort.SearchInts(arr, value)
	if pos >= len(arr) {
		// value is greater than every element (or arr is empty)
		return false
	}
	return arr[pos] == value
}

// quoteUnescape returns s with the escape sequences \\, \" and \' replaced by
// the escaped byte alone. A backslash followed by any other byte, or a
// backslash at the very end of the input, is copied through unchanged.
func quoteUnescape[T []byte | string](s T) string {
	var out bytes.Buffer
	total := len(s)
	for pos := 0; pos < total; pos++ {
		cur := s[pos]
		if cur == '\\' && pos+1 < total {
			switch peek := s[pos+1]; peek {
			case '\\', '"', '\'':
				// drop the backslash and emit the escaped byte instead
				cur = peek
				pos++
			}
		}
		out.WriteByte(cur)
	}
	return out.String()
}

func TestInjectVideoEventTrackers(t *testing.T) {
type args struct {
externalURL string
Expand Down Expand Up @@ -431,115 +455,119 @@ func TestInjectVideoEventTrackers(t *testing.T) {
}
}

func quoteUnescape[T []byte | string](s T) string {
buf := bytes.Buffer{}
for i := 0; i < len(s); i++ {
ch := s[i]
if ch == '\\' {
if i+1 < len(s) {
nextCh := s[i+1]
if nextCh == '\\' || nextCh == '"' || nextCh == '\'' {
i++
ch = nextCh
}
}
}
buf.WriteByte(ch)
func TestETreeBehaviour(t *testing.T) {
// vast1 := `<?xml version="1.0" encoding="UTF-8" standalone="no"?><VAST version="2.0"><Ad id="4650_86226f7b2a982e9cadfd8dc58d6965d0"><InLine><AdSystem version="1.0.0">Appreciate</AdSystem><Impression><![CDATA[https://ets-us-east-1.track.smaato.net/v1/view?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid&c=ortb26&expires=1728489767713&dpid=XXf9QjPbrtRrwIB0Nwyjfg%3D%3D%7CN7ChHoSOYimw_5CVEwXUIA%3D%3D&winurl=9YmCuFWdQfG4XONgkFmrin5Z3eiObza_044Wg3fHLUXiDz3TMFktd0VlAqWfhhkLZrF9JHy0zzYCFSJCL0jzgsAoBbaDy_rRj-RP0eCTyasl0oHJUs3BQgHWmJRaFA1hnc3WNrIf3Rsh8jCyDv5u9xl7PzMTXsBws4Mrd6vgGRytdqp-BuzQvJbWVEaRGkus9UMusCAjjlg1LDEIYthN8FQnCXX_3gT5DWlnnAXC2S6FBdJymSbfrkgsVkY_-g_PPx1ceGLgX6q6WxnL7Oof3pJ56SIjTszQ9xEOIk1mRmrNVgVumfu1LsFWgv0SRFMXyGKlYbTHjv_7cEcDmrjgky__uRyyqc5-ZUsF_9S1BgFfHMq9vcy7KQXAmRac0mRR8Psrnd3346wT15YyBSwkyg%3D%3D%7CixC2LMzFYaCtkl4MdiJPAA%3D%3D]]></Impression><AdTitle><![CDATA[ ]]></AdTitle><Description><![CDATA[ ]]></Description><Error><![CDATA[https://gotu.tpbid.com/vast/2?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&g=l&a=error&code=[ERRORCODE]]]></Error><Impression><![CDATA[https://gotu.tpbid.com/vast/2?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&g=l&t=0]]></Impression><Impression><![CDATA[https://gotu.tpbid.com/tsi?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&cid=4650_4154bbf6a600a80b63c9171e94701ad2&pt=4rUVxgZ4Xk13tX1v5zDrNhCRAsZlHo4MmockDHvuO4p425Ov4Y_BchAgD-4ZBKZzv2t9LGDBgZm2_ytNH1AglytvY0bPqId8nwsksCRX6vqb1-GqVwUkk3ZIPcUtx8INSl..]]></Impression><Creatives><Creative id="1" sequence="1"><Linear><Duration>00:00:30</Duration><TrackingEvents><Tracking 
event="firstQuartile"><![CDATA[https://gotu.tpbid.com/vast/2?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&g=l&t=25]]></Tracking><Tracking event="midpoint"><![CDATA[https://gotu.tpbid.com/vast/2?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&g=l&t=50]]></Tracking><Tracking event="thirdQuartile"><![CDATA[https://gotu.tpbid.com/vast/2?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&g=l&t=75]]></Tracking><Tracking event="complete"><![CDATA[https://gotu.tpbid.com/vast/2?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&g=l&t=100]]></Tracking><Tracking event="start"><![CDATA[https://vet-us-east-1.track.smaato.net/start?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid]]></Tracking><Tracking event="firstQuartile"><![CDATA[https://vet-us-east-1.track.smaato.net/firstQuartile?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid]]></Tracking><Tracking event="midpoint"><![CDATA[https://vet-us-east-1.track.smaato.net/midpoint?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid]]></Tracking><Tracking 
event="thirdQuartile"><![CDATA[https://vet-us-east-1.track.smaato.net/thirdQuartile?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid]]></Tracking><Tracking event="complete"><![CDATA[https://vet-us-east-1.track.smaato.net/complete?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid]]></Tracking></TrackingEvents><VideoClicks><ClickThrough><![CDATA[https://gotu.tpbid.com/click?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&ep0=eTw&ep1=uxWDmWxcmM&cf=1&ifap=1&esb=Jwd.2hwhSwMkC&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&xid=2&whoc=1&rt=4rUmY7Z4xh13tX1vTSIVDTo3As8Ft9F3h0ZZ9aWafHV-QPaaDpGhZK5YPiFpYamFv4tUnN6A5gOT3d-7zextn8P_-kd6C_daF2I9QjbeV3zhHl4Lt5A9R0H4tDEKzJB78WlR3W7QkqcEFAcEPNyqhl..&dmv=1]]></ClickThrough><ClickTracking><![CDATA[https://ets-us-east-1.track.smaato.net/v1/click?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid]]></ClickTracking></VideoClicks><MediaFiles><MediaFile bitrate="353" delivery="progressive" height="360" type="video/mp4" width="640"><![CDATA[https://c.tpbid.com/ssb/4650/videos/41e1c3e2b8873a195cbbcc524319e6bc.mp4]]></MediaFile></MediaFiles></Linear></Creative><Creative id="2" sequence="1"><CompanionAds><Companion height="320" width="480"><StaticResource 
creativeType="image/png"><![CDATA[https://c.tpbid.com/ssb/4650/images/e70b73e9da1c2c3dc6fdbafe9f96c494.jpg]]></StaticResource><CompanionClickThrough><![CDATA[https://gotu.tpbid.com/click?bid_id=4124e8b2e5c34d0b02021b8e50dca0d05bfaec52df969b6d6706a21f&ep0=eTw&ep1=uxWDmWxcmM&cf=1&ifap=1&esb=Jwd.2hwhSwMkC&cid=4650_4154bbf6a600a80b63c9171e94701ad2&crid=4650_86226f7b2a982e9cadfd8dc58d6965d0&lid=27f313dcd213beb73fb51378aeff34b7&xid=2&whoc=1&rt=4rUmY7Z4xh13tX1vTSIVDTo3As8Ft9F3h0ZZ9aWafHV-QPaaDpGhZK5YPiFpYamFv4tUnN6A5gOT3d-7zextn8P_-kd6C_daF2I9QjbeV3zhHl4Lt5A9R0H4tDEKzJB78WlR3W7QkqcEFAcEPNyqhl..&dmv=1]]></CompanionClickThrough><TrackingEvents><Tracking event="creativeView"><![CDATA[https://vet-us-east-1.track.smaato.net/companion/creativeView?sessionId=5251ca0e-bbf6-0e29-ae23-b7f26a5afde1&adSourceId=3b574e75-bf30-58a8-dd58-e1150fc75c7a&originalRequestTime=1728487967713&e=prebid]]></Tracking></TrackingEvents></Companion></CompanionAds></Creative></Creatives><Extensions><Extension type="Pricing"><Price currency="USD" model="CPM" source="smaato"><![CDATA[0.14087]]></Price></Extension></Extensions></InLine></Ad></VAST>`
tests := []struct {
name string
in string
out string
}{
{
name: "test",
in: "<AdTitle>&#xA; [ini:PDC][fmt:Video][crs:3682][csz:15s]&#xA; </AdTitle>",
out: "<AdTitle><![CDATA[[ini:PDC][fmt:Video][crs:3682][csz:15s]]]></AdTitle>",
}}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
doc := etree.NewDocument()
doc.WriteSettings.CanonicalEndTags = true

err := doc.ReadFromString(tt.in)
assert.Nil(t, err)

out, err := doc.WriteToString()
assert.Nil(t, err)
assert.Equal(t, tt.out, out)
})
}
return buf.String()
}

func TestCompareXMLParsers(t *testing.T) {
fileName := `./test/raw_vast.txt`
//fileName = `../../base64_vast.txt`
//$ cat *-prod.txt | sed -n 's/.*creative:\[\(.*\)\].*/\1/p' > $GOPATH/src/github.com/PubMatic-OpenWrap/prebid-server/endpoints/events/test/base64_vast.txt
type stats struct {
valid []int
generalMismatch []int
singleQuote []int
}

base64Decode := strings.Contains(fileName, "base64")
var (
//fileName = `./test/base64_vast.txt`
//fileName = `./test/base64_quoted_vast.txt`
fileName = `./test/raw_vast.txt`
quoted = strings.Contains(fileName, "quoted") //xml files retrived from prod vast unwrapper
base64Decode = strings.Contains(fileName, "base64")
debugLines = []int{}
st = stats{}
currentLine, xmlCount = 0, 0
)

file, err := os.Open(fileName)
if !assert.Nil(t, err) {
return
}

defer file.Close()
var mismatched, debugLines []int
line := 0

scanner := bufio.NewScanner(file)
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

//debugLines = []int{19, 24, 25, 29, 58, 80, 83, 84, 86, 93, 128, 147, 151, 155, 159, 168, 184, 190, 199, 200, 225, 226, 243, 249, 254, 261, 272, 281, 291, 298, 310, 312, 320, 323, 328, 340, 350, 358, 362, 373, 376, 384}
sort.Ints(debugLines)

for scanner.Scan() {
line++
currentLine++
vast := scanner.Text()
if len(debugLines) > 0 && sort.SearchInts(debugLines, line) == len(debugLines) {
continue
}

if base64Decode {
data, err := base64.StdEncoding.DecodeString(vast)
if !assert.Nil(t, err) {
continue
//presetup
{
//debug
if len(debugLines) > 0 {
if found := search(debugLines, currentLine); !found {
continue
}
}

//base64decode
if base64Decode {
data, err := base64.StdEncoding.DecodeString(vast)
if !assert.Nil(t, err) {
continue
}
vast = string(data)
if quoted {
vast = quoteUnescape(data)
}
}
vast = quoteUnescape(data)
}
t.Run(fmt.Sprintf("vast_%d", line), func(t *testing.T) {

t.Run(fmt.Sprintf("vast_%d", currentLine), func(t *testing.T) {
xmlCount++

etreeXML, _ := injectVideoEventsETree(vast, eventURLMap, false, adcom1.LinearityLinear)
fastXML, _ := injectVideoEventsFastXML(vast, eventURLMap, false, adcom1.LinearityLinear)

if vast != fastXML {
fastXML = tmpFastXMLProcessing(fastXML)
fastXML, etreeXML = tmpFastXMLProcessing(fastXML, etreeXML)
}

if !assert.Equal(t, etreeXML, fastXML) {
mismatched = append(mismatched, line)
if len(debugLines) > 0 {
assert.Equal(t, etreeXML, fastXML, vast)
}
})
}
t.Logf("\ntotal:[%v] mismatched:[%v] lines:[%v]", line, len(mismatched), mismatched)
assert.Equal(t, 0, len(mismatched))
assert.Nil(t, scanner.Err())
}

func TestBase64(t *testing.T) {
fileName := `./test/ow_failed.txt`

file, err := os.Open(fileName)
if !assert.Nil(t, err) {
return
if etreeXML != fastXML {
if idx := strings.Index(etreeXML, "&apos;"); idx != -1 &&
(strings.HasPrefix(fastXML[idx:], "&#39;") || strings.HasPrefix(fastXML[idx:], "\"")) {
st.singleQuote = append(st.singleQuote, currentLine)
} else {
st.generalMismatch = append(st.generalMismatch, currentLine)
}
return
}
st.valid = append(st.valid, currentLine)
})
}

defer file.Close()
var mismatched, errored, debugLines []int
var line, singleQuotePresent, maxLength int

maxLength = 14884
scanner := bufio.NewScanner(file)
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

//debugLines = []int{19, 24, 25, 29, 58, 80, 83, 84, 86, 93, 128, 147, 151, 155, 159, 168, 184, 190, 199, 200, 225, 226, 243, 249, 254, 261, 272, 281, 291, 298, 310, 312, 320, 323, 328, 340, 350, 358, 362, 373, 376, 384}
sort.Ints(debugLines)

for scanner.Scan() {
line++
value := scanner.Text()

if len(debugLines) > 0 && sort.SearchInts(debugLines, line) == len(debugLines) {
continue
}

vast, err := base64.RawStdEncoding.DecodeString(value[0:maxLength])

if err != nil {
errored = append(errored, line)
continue
}

if bytes.Contains(vast, []byte("'")) {
singleQuotePresent++
} else {
mismatched = append(mismatched, line)
}
}
assert.Empty(t, mismatched)
assert.Empty(t, errored)
t.Logf("\nTotal:[%v] validCount:[%v] generalMismatch:[%v] singleQuote:[%v]", xmlCount, st.valid, st.generalMismatch, st.singleQuote)
assert.NotZero(t, xmlCount)
assert.Equal(t, xmlCount, len(st.valid), "validXMLCount")
assert.Equal(t, 0, len(st.generalMismatch), "generalMismatch")
assert.Equal(t, 0, len(st.singleQuote), "singleQuote")
assert.Nil(t, scanner.Err())
}
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ require (
git.pubmatic.com/PubMatic/go-common v0.0.0-20240313090142-97ff3d63b7c3
git.pubmatic.com/PubMatic/go-netacuity-client v0.0.0-20240104092757-5d6f15e25fe3
git.pubmatic.com/vastunwrap v0.0.0-00010101000000-000000000000
github.com/PubMatic-OpenWrap/fastxml v0.0.0-20240826060652-d9d5d05fdad2
github.com/PubMatic-OpenWrap/fastxml v0.0.0-20241125102315-0d8f851a6e52
github.com/beevik/etree/110 v0.0.0-00010101000000-000000000000
github.com/diegoholiveira/jsonlogic/v3 v3.5.3
github.com/go-sql-driver/mysql v1.7.1
github.com/golang/mock v1.6.0
Expand Down Expand Up @@ -88,7 +89,6 @@ require (
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f // indirect
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 // indirect
github.com/yudai/pp v2.0.1+incompatible // indirect
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/sys v0.18.0 // indirect
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect
Expand All @@ -100,6 +100,6 @@ replace github.com/prebid/prebid-server/v2 => ./

replace github.com/prebid/openrtb/v20 => github.com/PubMatic-OpenWrap/prebid-openrtb/v20 v20.0.0-20240222072752-2d647d1707ef

replace github.com/beevik/etree v1.0.2 => github.com/PubMatic-OpenWrap/etree v1.0.2-0.20240914050009-a916f68552f5
replace github.com/beevik/etree v1.0.2 => github.com/PubMatic-OpenWrap/etree v1.0.2-0.20241125102329-0b5c47d99ad5

replace github.com/beevik/etree/110 => github.com/beevik/etree v1.1.0
Loading

0 comments on commit 0cef376

Please sign in to comment.