# extract.yml


tests:
  # Cases for extracting @mentions from tweet text. `expected` is the list of
  # screen names (without the leading "@") in order of appearance; an empty
  # list means nothing should be extracted.
  mentions:
    - description: "Extract mention at the beginning of a tweet"
      text: "@username reply"
      expected: ["username"]

    - description: "Extract mention at the end of a tweet"
      text: "mention @username"
      expected: ["username"]

    - description: "Extract mention in the middle of a tweet"
      text: "mention @username in the middle"
      expected: ["username"]

    - description: "Extract mention of username with underscore"
      text: "mention @user_name"
      expected: ["user_name"]

    - description: "Extract mention of all numeric username"
      text: "mention @12345"
      expected: ["12345"]

    - description: "Extract mention of multiple usernames"
      text: "mention @username1 @username2"
      expected: ["username1", "username2"]

    # No whitespace separates the mention from the surrounding Japanese text.
    - description: "Extract mention in the middle of a Japanese tweet"
      text: "の@usernameに到着を待っている"
      expected: ["username"]

    - description: "DO NOT extract username ending in @"
      text: "Current Status: @_@ (cc: @username)"
      expected: ["username"]

    - description: "DO NOT extract username followed by accented latin characters"
      text: "@aliceìnheiro something something"
      expected: []

    - description: "Extract lone mention but not @user@user (too close to an email)"
      text: "@username email me @test@example.com"
      expected: ["username"]

  # Mention extraction with positions. Each expected entry pairs the screen
  # name with [start, end) offsets into `text`; the span includes the "@" and
  # is counted in characters, not bytes (see the Japanese case).
  mentions_with_indices:
    - description: 'Extract a mention at the start'
      text: '@username yo!'
      expected:
        - screen_name: 'username'
          indices: [0, 9]

    - description: 'Extract a mention that has the same thing mentioned at the start'
      text: 'username @username'
      expected:
        - screen_name: 'username'
          indices: [9, 18]

    - description: 'Extract a mention in the middle of a Japanese tweet'
      text: 'の@usernameに到着を待っている'
      expected:
        - screen_name: 'username'
          indices: [1, 10]

  # Cases for extracting the replied-to screen name. `expected` is a single
  # screen name (without "@"), or an explicit null when the text is not a
  # reply. In these cases only leading whitespace may precede the "@".
  replies:
    - description: "Extract reply at the beginning of a tweet"
      text: "@username reply"
      expected: "username"

    - description: "Extract reply preceded by only a space"
      text: " @username reply"
      expected: "username"

    # The full-width space is written as an escape so it stays visible and
    # cannot be silently normalized to an ASCII space by an editor.
    - description: "Extract reply preceded by only a full-width space (U+3000)"
      text: "\u3000@username reply"
      expected: "username"

    - description: "DO NOT Extract reply when preceded by text"
      text: "a @username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by ."
      text: ".@username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by /"
      text: "/@username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by _"
      text: "_@username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by -"
      text: "-@username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by +"
      text: "+@username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by #"
      text: "#@username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by !"
      text: "!@username mention, not a reply"
      expected: null

    - description: "DO NOT Extract reply when preceded by @"
      text: "@@username mention, not a reply"
      expected: null

  # Cases for extracting URLs from tweet text. `expected` is the list of URLs
  # in order of appearance; an empty list means nothing should be extracted.
  urls:
    - description: "Extract a lone URL"
      text: "http://example.com"
      expected: ["http://example.com"]

    - description: "Extract valid URL: http://google.com"
      text: "text http://google.com"
      expected: ["http://google.com"]

    - description: "Extract valid URL: http://foobar.com/#"
      text: "text http://foobar.com/#"
      expected: ["http://foobar.com/#"]

    - description: "Extract valid URL: http://google.com/#foo"
      text: "text http://google.com/#foo"
      expected: ["http://google.com/#foo"]

    - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
      text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
      expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]

    - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
      text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
      expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]

    - description: "Extract valid URL: http://somedomain.com/index.php?path=/abc/def/"
      text: "text http://somedomain.com/index.php?path=/abc/def/"
      expected: ["http://somedomain.com/index.php?path=/abc/def/"]

    - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
      text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
      expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]

    - description: "Extract valid URL: http://somehost.com:3000"
      text: "text http://somehost.com:3000"
      expected: ["http://somehost.com:3000"]

    - description: "Extract valid URL: http://xo.com/~matthew+%ff-x"
      text: "text http://xo.com/~matthew+%ff-x"
      expected: ["http://xo.com/~matthew+%ff-x"]

    - description: "Extract valid URL: http://xo.com/~matthew+%ff-,.;x"
      text: "text http://xo.com/~matthew+%ff-,.;x"
      expected: ["http://xo.com/~matthew+%ff-,.;x"]

    - description: "Extract valid URL: http://xo.com/,.;x"
      text: "text http://xo.com/,.;x"
      expected: ["http://xo.com/,.;x"]

    - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
      text: "text http://en.wikipedia.org/wiki/Primer_(film)"
      expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]

    - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
      text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
      expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]

    - description: "Extract valid URL: http://✪df.ws/ejp"
      text: "text http://✪df.ws/ejp"
      expected: ["http://✪df.ws/ejp"]

    - description: "Extract valid URL: http://chilp.it/?77e8fd"
      text: "text http://chilp.it/?77e8fd"
      expected: ["http://chilp.it/?77e8fd"]

    - description: "Extract valid URL: http://x.com/oneletterdomain"
      text: "text http://x.com/oneletterdomain"
      expected: ["http://x.com/oneletterdomain"]

    # The URL named in each "invalid URL" description mirrors the URL in `text`.
    - description: "DO NOT extract invalid URL: http://domain-dash_2314352345_dfasd.foo-cow_4352.com"
      text: "text http://domain-dash_2314352345_dfasd.foo-cow_4352.com"
      expected: []

    - description: "DO NOT extract invalid URL: http://-dash_2314352345_dfasd.foo-cow_4352.com"
      text: "text http://-dash_2314352345_dfasd.foo-cow_4352.com"
      expected: []

    - description: "DO NOT extract invalid URL: http://no-tld"
      text: "text http://no-tld"
      expected: []

    - description: "DO NOT extract invalid URL: http://tld-too-short.x"
      text: "text http://tld-too-short.x"
      expected: []

    - description: "Extract a very long hyphenated sub-domain URL (single letter hyphens)"
      text: "text http://word-and-a-number-8-ftw.domain.tld/"
      expected: ["http://word-and-a-number-8-ftw.domain.tld/"]

    # Only the part up to the TLD is expected; the hyphenated tail is dropped.
    - description: "Extract a hyphenated TLD (usually a typo)"
      text: "text http://domain.tld-that-you-should-have-put-a-space-after"
      expected: ["http://domain.tld"]

    - description: "Extract URL ending with # value"
      text: "text http://foo.com?#foo text"
      expected: ["http://foo.com?#foo"]

    - description: "SHOULD NOT Extract URLs without protocol on (com|org|edu|gov|net) domains"
      text: "foo.com foo.net foo.org foo.edu foo.gov"
      expected: []

    - description: "DO NOT extract URLs without protocol not on (com|org|edu|gov|net) domains, even when preceded by www."
      text: "foo.bar foo.co.jp www.foo.bar www.foo.co.uk wwwww.foo foo.comm foo.somecom foo.govedu"
      expected: []

    - description: "Extract URLs with a - or + at the end of the path"
      text: "Go to http://example.com/a+ or http://example.com/a-"
      expected: ["http://example.com/a+", "http://example.com/a-"]

    - description: "Extract URLs with longer paths ending in -"
      text: "Go to http://example.com/view/slug-url-?foo=bar"
      expected: ["http://example.com/view/slug-url-?foo=bar"]

    - description: "DO NOT extract URLs beginning with a space"
      text: "@user Try http:// example.com/path"
      expected: []

    # The non-breaking space is written as an escape: as a literal it is
    # indistinguishable from the ASCII space in the case above and is easily
    # lost when the file is edited or re-encoded.
    - description: "DO NOT extract URLs beginning with a non-breaking space (U+00A0)"
      text: "@user Try http://\u00A0example.com/path"
      expected: []

    - description: "Extract URLs with underscores and dashes in the subdomain"
      text: "test http://sub_domain-dash.twitter.com"
      expected: ["http://sub_domain-dash.twitter.com"]

    - description: "Extract URL with minimum number of valid characters"
      text: "test http://a.b.cd"
      expected: ["http://a.b.cd"]

    - description: "Extract URLs containing underscores and dashes"
      text: "test http://a_b.c-d.com"
      expected: ["http://a_b.c-d.com"]

    - description: "Extract URLs containing dashes in the subdomain"
      text: "test http://a-b.c.com"
      expected: ["http://a-b.c.com"]

    - description: "Extract URLs with dashes in the domain name"
      text: "test http://twitter-dash.com"
      expected: ["http://twitter-dash.com"]

    - description: "Extract URLs with lots of symbols then a period"
      text: "http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"
      expected: ["http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"]

    - description: "DO NOT extract URLs containing leading dashes in the subdomain"
      text: "test http://-leadingdash.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing dashes in the subdomain"
      text: "test http://trailingdash-.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing leading underscores in the subdomain"
      text: "test http://_leadingunderscore.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing underscores in the subdomain"
      text: "test http://trailingunderscore_.twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing leading dashes in the domain name"
      text: "test http://-twitter.com"
      expected: []

    - description: "DO NOT extract URLs containing trailing dashes in the domain name"
      text: "test http://twitter-.com"
      expected: []

    - description: "DO NOT extract URLs containing underscores in the domain name"
      text: "test http://twitter_underscore.com"
      expected: []

    - description: "DO NOT extract URLs containing underscores in the tld"
      text: "test http://twitter.c_o_m"
      expected: []

  # URL extraction with positions. Each expected entry pairs the URL with
  # [start, end) offsets into `text`, counted in characters, not bytes (see
  # the Japanese case).
  urls_with_indices:
    - description: 'Extract a URL'
      text: 'text http://google.com'
      expected:
        - url: 'http://google.com'
          indices: [5, 22]

    - description: 'Extract a URL from a Japanese tweet'
      text: '皆さん見てください！ http://google.com'
      expected:
        - url: 'http://google.com'
          indices: [11, 28]

  # Cases for extracting hashtags from tweet text. `expected` is the list of
  # hashtag texts (without the leading "#" or "＃") in order of appearance; an
  # empty list means nothing should be extracted.
  hashtags:
    - description: "Extract an all-alpha hashtag"
      text: "a #hashtag here"
      expected: ["hashtag"]

    - description: "Extract a letter-then-number hashtag"
      text: "this is #hashtag1"
      expected: ["hashtag1"]

    - description: "Extract a number-then-letter hashtag"
      text: "#1hashtag is this"
      expected: ["1hashtag"]

    - description: "DO NOT Extract an all-numeric hashtag"
      text: "On the #16 bus"
      expected: []

    - description: "Extract a hashtag containing ñ"
      text: "I'll write more tests #mañana"
      expected: ["mañana"]

    - description: "Extract a hashtag containing é"
      text: "Working remotely #café"
      expected: ["café"]

    - description: "Extract a hashtag containing ü"
      text: "Getting my Oktoberfest on #münchen"
      expected: ["münchen"]

    # Japanese hashtags are extracted by later cases in this section; here the
    # "#" is followed by a space, so there is no hashtag to extract.
    - description: "DO NOT Extract Japanese text separated from # by a space"
      text: "this is not valid: # 会議中 ハッシュ"
      expected: []

    - description: "Extract a hashtag in Korean"
      text: "What is #트위터 anyway?"
      expected: ["트위터"]

    - description: "Extract a hashtag in Russian"
      text: "What is #ашок anyway?"
      expected: ["ашок"]

    - description: "Extract a starting katakana hashtag"
      text: "#カタカナ is a hashtag"
      expected: ["カタカナ"]

    - description: "Extract a starting hiragana hashtag"
      text: "#ひらがな FTW!"
      expected: ["ひらがな"]

    - description: "Extract a starting kanji hashtag"
      text: "#漢字 is the future"
      expected: ["漢字"]

    - description: "Extract a trailing katakana hashtag"
      text: "Hashtag #カタカナ"
      expected: ["カタカナ"]

    - description: "Extract a trailing hiragana hashtag"
      text: "Japanese hashtags #ひらがな"
      expected: ["ひらがな"]

    - description: "Extract a trailing kanji hashtag"
      text: "Study time #漢字"
      expected: ["漢字"]

    - description: "Extract a central katakana hashtag"
      text: "See my #カタカナ hashtag?"
      expected: ["カタカナ"]

    - description: "Extract a central hiragana hashtag"
      text: "Study #ひらがな for fun and profit"
      expected: ["ひらがな"]

    - description: "Extract a central kanji hashtag"
      text: "Some say #漢字 is the past. what do they know?"
      expected: ["漢字"]

    - description: "Extract a Kanji/Katakana mixed hashtag"
      text: "日本語ハッシュタグテスト #日本語ハッシュタグ"
      expected: ["日本語ハッシュタグ"]

    - description: "Extract a hashtag after a punctuation"
      text: "日本語ハッシュテスト。#日本語ハッシュタグ"
      expected: ["日本語ハッシュタグ"]

    - description: "DO NOT include a punctuation in a hashtag"
      text: "#日本語ハッシュタグ。"
      expected: ["日本語ハッシュタグ"]

    # Both the hash sign and the tag text are full-width characters here.
    - description: "Extract a full-width Alnum hashtag"
      text: "全角英数字ハッシュタグ ＃ｈａｓｈｔａｇ１２３"
      expected: ["ｈａｓｈｔａｇ１２３"]

    - description: "DO NOT extract a hashtag without a preceding space"
      text: "日本語ハッシュタグ#日本語ハッシュタグ"
      expected: []

  # Hashtag extraction with positions. Each expected entry pairs the hashtag
  # text with [start, end) offsets into `text`; the span includes the "#" and
  # is counted in characters, not bytes (see the multi-byte case).
  hashtags_with_indices:
    - description: "Extract a hashtag at the start"
      text: "#hashtag here"
      expected:
        - hashtag: "hashtag"
          indices: [0, 8]

    - description: "Extract a hashtag at the end"
      text: "test a #hashtag"
      expected:
        - hashtag: "hashtag"
          indices: [7, 15]

    - description: "Extract a hashtag in the middle"
      text: "test a #hashtag in a string"
      expected:
        - hashtag: "hashtag"
          indices: [7, 15]

    # "#123" is all-numeric and is skipped; only "#hashtag" is reported.
    - description: "Extract only a valid hashtag"
      text: "#123 a #hashtag in a string"
      expected:
        - hashtag: "hashtag"
          indices: [7, 15]

    - description: "Extract a hashtag in a string of multi-byte characters"
      text: "会議中 #hashtag 会議中"
      expected:
        - hashtag: "hashtag"
          indices: [4, 12]