+ * Attributes are treated as a map: there can be only one value associated with an attribute key/name. + *
+ *+ * Attribute name and value comparisons are case sensitive. By default for HTML, attribute names are + * normalized to lower-case on parsing. That means you should use lower-case strings when referring to attributes by + * name. + *
+ * + * + */ +open class Attributes: NSCopying { + + public static var dataPrefix: String = "data-" + + // Stored by lowercased key, but key case is checked against the copy inside + // the Attribute on retrieval. + var attributes: [Attribute] = [] + + public init() {} + + /** + Get an attribute value by key. + @param key the (case-sensitive) attribute key + @return the attribute value if set; or empty string if not set. + @see #hasKey(String) + */ + open func get(key: String) -> String { + if let attr = attributes.first(where: { $0.getKey() == key }) { + return attr.getValue() + } + return "" + } + + /** + * Get an attribute's value by case-insensitive key + * @param key the attribute name + * @return the first matching attribute value if set; or empty string if not set. + */ + open func getIgnoreCase(key: String )throws -> String { + try Validate.notEmpty(string: key) + if let attr = attributes.first(where: { $0.getKey().caseInsensitiveCompare(key) == .orderedSame }) { + return attr.getValue() + } + return "" + } + + /** + Set a new attribute, or replace an existing one by key. + @param key attribute key + @param value attribute value + */ + open func put(_ key: String, _ value: String) throws { + let attr = try Attribute(key: key, value: value) + put(attribute: attr) + } + + /** + Set a new boolean attribute, remove attribute if value is false. + @param key attribute key + @param value attribute value + */ + open func put(_ key: String, _ value: Bool) throws { + if (value) { + try put(attribute: BooleanAttribute(key: key)) + } else { + try remove(key: key) + } + } + + /** + Set a new attribute, or replace an existing one by (case-sensitive) key. + @param attribute attribute + */ + open func put(attribute: Attribute) { + let key = attribute.getKey() + if let ix = attributes.firstIndex(where: { $0.getKey() == key }) { + attributes[ix] = attribute + } else { + attributes.append(attribute) + } + } + + /** + Remove an attribute by key. Case sensitive. 
+ @param key attribute key to remove + */ + open func remove(key: String)throws { + try Validate.notEmpty(string: key) + if let ix = attributes.firstIndex(where: { $0.getKey() == key }) { + attributes.remove(at: ix) } + } + + /** + Remove an attribute by key. Case insensitive. + @param key attribute key to remove + */ + open func removeIgnoreCase(key: String ) throws { + try Validate.notEmpty(string: key) + if let ix = attributes.firstIndex(where: { $0.getKey().caseInsensitiveCompare(key) == .orderedSame}) { + attributes.remove(at: ix) + } + } + + /** + Tests if these attributes contain an attribute with this key. + @param key case-sensitive key to check for + @return true if key exists, false otherwise + */ + open func hasKey(key: String) -> Bool { + return attributes.contains(where: { $0.getKey() == key }) + } + + /** + Tests if these attributes contain an attribute with this key. + @param key key to check for + @return true if key exists, false otherwise + */ + open func hasKeyIgnoreCase(key: String) -> Bool { + return attributes.contains(where: { $0.getKey().caseInsensitiveCompare(key) == .orderedSame}) + } + + /** + Get the number of attributes in this set. + @return size + */ + open func size() -> Int { + return attributes.count + } + + /** + Add all the attributes from the incoming set to this set. + @param incoming attributes to add to these attributes. + */ + open func addAll(incoming: Attributes?) { + guard let incoming = incoming else { return } + for attr in incoming.attributes { + put(attribute: attr) + } + } + + /** + Get the attributes as a List, for iteration. Do not modify the keys of the attributes via this view, as changes + to keys will not be recognised in the containing set. + @return an view of the attributes as a List. + */ + open func asList() -> [Attribute] { + return attributes + } + + /** + * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys + * starting with {@code data-}. 
+ * @return map of custom data attributes. + */ + open func dataset() -> [String: String] { + let prefixLength = Attributes.dataPrefix.count + let pairs = attributes.filter { $0.isDataAttribute() } + .map { ($0.getKey().substring(prefixLength), $0.getValue()) } + return Dictionary(uniqueKeysWithValues: pairs) + } + + /** + Get the HTML representation of these attributes. + @return HTML + @throws SerializationException if the HTML representation of the attributes cannot be constructed. + */ + open func html()throws -> String { + let accum = StringBuilder() + try html(accum: accum, out: Document("").outputSettings()) // output settings a bit funky, but this html() seldom used + return accum.toString() + } + + public func html(accum: StringBuilder, out: OutputSettings ) throws { + for attr in attributes { + accum.append(" ") + attr.html(accum: accum, out: out) + } + } + + open func toString()throws -> String { + return try html() + } + + /** + * Checks if these attributes are equal to another set of attributes, by comparing the two sets + * @param o attributes to compare with + * @return if both sets of attributes have the same content + */ + open func equals(o: AnyObject?) -> Bool { + if(o == nil) {return false} + if (self === o.self) {return true} + guard let that = o as? Attributes else {return false} + return (attributes == that.attributes) + } + + open func lowercaseAllKeys() { + for ix in attributes.indices { + attributes[ix].key = attributes[ix].key.lowercased() + } + } + + public func copy(with zone: NSZone? = nil) -> Any { + let clone = Attributes() + clone.attributes = attributes + return clone + } + + open func clone() -> Attributes { + return self.copy() as! Attributes + } + + fileprivate static func dataKey(key: String) -> String { + return dataPrefix + key + } + +} + +extension Attributes: Sequence { + public func makeIterator() -> AnyIterator+ * A selector is a chain of simple selectors, separated by combinators. 
Selectors are case insensitive (including against + * elements, attributes, and attribute values). + *
+ *+ * The universal selector (*) is implicit when no element selector is supplied (i.e. {@code *.header} and {@code .header} + * is equivalent). + *
+ *Pattern | Matches | Example | |
---|---|---|---|
* | any element | * | |
tag | elements with the given tag name | div | |
*|E | elements of type E in any namespace ns | *|name finds <fb:name> elements | |
ns|E | elements of type E in the namespace ns | fb|name finds <fb:name> elements | |
#id | elements with attribute ID of "id" | div#wrap , #logo | |
.class | elements with a class name of "class" | div.left , .result | |
[attr] | elements with an attribute named "attr" (with any value) | a[href] , [title] | |
[^attrPrefix] | elements with an attribute name starting with "attrPrefix". Use to find elements with HTML5 datasets | [^data-] , div[^data-] | |
[attr=val] | elements with an attribute named "attr", and value equal to "val" | img[width=500] , a[rel=nofollow] | |
[attr="val"] | elements with an attribute named "attr", and value equal to "val" | span[hello="Cleveland"][goodbye="Columbus"] , a[rel="nofollow"] | |
[attr^=valPrefix] | elements with an attribute named "attr", and value starting with "valPrefix" | a[href^=http:] | |
[attr$=valSuffix] | elements with an attribute named "attr", and value ending with "valSuffix" | img[src$=.png] | |
[attr*=valContaining] | elements with an attribute named "attr", and value containing "valContaining" | a[href*=/search/] | |
[attr~=regex] | elements with an attribute named "attr", and value matching the regular expression | img[src~=(?i)\\.(png|jpe?g)] | |
The above may be combined in any order | div.header[title] | ||
Combinators | |||
E F | an F element descended from an E element | div a , .logo h1 | |
E {@literal >} F | an F direct child of E | ol {@literal >} li | |
E + F | an F element immediately preceded by sibling E | li + li , div.head + div | |
E ~ F | an F element preceded by sibling E | h1 ~ p | |
E, F, G | all matching elements E, F, or G | a[href], div, h3 | |
Pseudo selectors | |||
:lt(n) | elements whose sibling index is less than n | td:lt(3) finds the first 3 cells of each row | |
:gt(n) | elements whose sibling index is greater than n | td:gt(1) finds cells after skipping the first two | |
:eq(n) | elements whose sibling index is equal to n | td:eq(0) finds the first cell of each row | |
:has(selector) | elements that contains at least one element matching the selector | div:has(p) finds divs that contain p elements | |
:not(selector) | elements that do not match the selector. See also {@link Elements#not(String)} | div:not(.logo) finds all divs that do not have the "logo" class.
| |
:contains(text) | elements that contains the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. | p:contains(SwiftSoup) finds p elements containing the text "SwiftSoup". | |
:matches(regex) | elements whose text matches the specified regular expression. The text may appear in the found element, or any of its descendants. | td:matches(\\d+) finds table cells containing digits. div:matches((?i)login) finds divs containing the text, case insensitively. | |
:containsOwn(text) | elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants. | p:containsOwn(SwiftSoup) finds p elements with own text "SwiftSoup". | |
:matchesOwn(regex) | elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants. | td:matchesOwn(\\d+) finds table cells directly containing digits. div:matchesOwn((?i)login) finds divs containing the text, case insensitively. | |
The above may be combined in any order and with other selectors | .light:contains(name):eq(0) | ||
Structural pseudo selectors | |||
:root | The element that is the root of the document. In HTML, this is the html element | :root | |
:nth-child(an+b) | elements that have an+b-1 siblings before it in the document tree, for any zero or positive integer value of n, and has a parent element. :nth-child() can take odd and even as arguments instead. odd has the same signification as 2n+1 , and even has the same signification as 2n . | tr:nth-child(2n+1) finds every odd row of a table. :nth-child(10n-1) the 9th, 19th, 29th, etc, element. li:nth-child(5) the 5th li | |
:nth-last-child(an+b) | elements that have an+b-1 siblings after it in the document tree. Otherwise like :nth-child() | tr:nth-last-child(-n+2) the last two rows of a table | |
:nth-of-type(an+b) | pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name before it in the document tree, for any zero or positive integer value of n, and has a parent element | img:nth-of-type(2n+1) | |
:nth-last-of-type(an+b) | pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name after it in the document tree, for any zero or positive integer value of n, and has a parent element | img:nth-last-of-type(2n+1) | |
:first-child | elements that are the first child of some other element. | div {@literal >} p:first-child | |
:last-child | elements that are the last child of some other element. | ol {@literal >} li:last-child | |
:first-of-type | elements that are the first sibling of its type in the list of children of its parent element | dl dt:first-of-type | |
:last-of-type | elements that are the last sibling of its type in the list of children of its parent element | tr {@literal >} td:last-of-type | |
:only-child | elements that have a parent element and whose parent element has no other element children | |
:only-of-type | an element that has a parent element and whose parent element has no other element children with the same expanded element name | ||
:empty | elements that have no children at all |
This enables + * {@link #updateMetaCharsetElement(boolean) meta charset update}.
+ * + *If there's no element with charset / encoding information yet it will + * be created. Obsolete charset / encoding definitions are removed!
+ * + *Elements used:
+ * + *If set to false (default) there are no elements + * modified.
+ * + * @param update If true the element updated on charset + * changes, false if not + * + * @see #charset(java.nio.charset.Charset) + */ + public func updateMetaCharsetElement(_ update: Bool) { + self.updateMetaCharset = update + } + + /** + * Returns whether the element with charset information in this document is + * updated on changes through {@link #charset(java.nio.charset.Charset) + * Document.charset(Charset)} or not. + * + * @return Returns true if the element is updated on charset + * changes, false if not + */ + public func updateMetaCharsetElement() -> Bool { + return updateMetaCharset + } + + /** + * Ensures a meta charset (html) or xml declaration (xml) with the current + * encoding used. This only applies with + * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to + * true, otherwise this method does nothing. + * + *Elements used:
+ * + *base
, which provides a limited set of named HTML
+     * entities and escapes other characters as numbered entities for maximum compatibility; or extended,
+     * which uses the complete set of HTML named entities.
+     *
+     * The default escape mode is base.
+ * @return the document's current escape mode
+ */
+    public func escapeMode() -> Entities.EscapeMode {
+        return self._escapeMode
+    }
+
+    /**
+     * Set the document's escape mode, which determines how characters are escaped when the output character set
+     * does not support a given character: using either a named or a numbered escape.
+     * @param escapeMode the new escape mode to use
+     * @return these output settings, for chaining
+     */
+    @discardableResult
+    public func escapeMode(_ escapeMode: Entities.EscapeMode) -> OutputSettings {
+        _escapeMode = escapeMode
+        return self
+    }
+
+    /**
+     * Get the document's current output charset, which is used to control which characters are escaped when
+     * generating HTML (via the html() methods), and which are kept intact.
+     *
+     * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
+     * input charset. Otherwise, it defaults to UTF-8.
+     * @return the document's current charset.
+     */
+    public func encoder() -> String.Encoding {
+        return self._encoder
+    }
+
+    /**
+     * Same value as encoder(), exposed under the charset name.
+     * @return the document's current charset.
+     */
+    public func charset() -> String.Encoding {
+        return self._encoder
+    }
+
+    /**
+     * Update the document's output charset.
+     * @param encoder the new charset to use.
+     * @return these output settings, for chaining
+     */
+    @discardableResult
+    public func encoder(_ encoder: String.Encoding) -> OutputSettings {
+        _encoder = encoder
+        return self
+    }
+
+    /**
+     * Update the document's output charset; forwards to encoder(_:).
+     * @param e the new charset to use.
+     * @return these output settings, for chaining
+     */
+    @discardableResult
+    public func charset(_ e: String.Encoding) -> OutputSettings {
+        let updated = encoder(e)
+        return updated
+    }
+
+    /**
+     * Get the document's current output syntax.
+     * @return the syntax currently used for serialization
+     */
+    public func syntax() -> Syntax {
+        return self._syntax
+    }
+
+    /**
+     * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
+     * {@code xml}, with self-closing tags.
+     * @param syntax serialization syntax
+     * @return these output settings, for chaining
+     */
+    @discardableResult
+    public func syntax(syntax: Syntax) -> OutputSettings {
+        self._syntax = syntax
+        return self
+    }
+
+    /**
+     * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
+     * the output, and the output will generally look like the input.
+     * @return true if pretty printing is enabled.
+     */
+    public func prettyPrint() -> Bool {
+        return self._prettyPrint
+    }
+
+    /**
+     * Enable or disable pretty printing.
+     * @param pretty new pretty print setting
+     * @return this, for chaining
+     */
+    @discardableResult
+    public func prettyPrint(pretty: Bool) -> OutputSettings {
+        self._prettyPrint = pretty
+        return self
+    }
+
+    /**
+     * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
+     * all tags as block.
+     * @return true if outline mode is enabled.
+     */
+    public func outline() -> Bool {
+        return self._outline
+    }
+
+    /**
+     * Enable or disable HTML outline mode.
+     * @param outlineMode new outline setting
+     * @return this, for chaining
+     */
+    @discardableResult
+    public func outline(outlineMode: Bool) -> OutputSettings {
+        self._outline = outlineMode
+        return self
+    }
+
+    /**
+     * Get the current tag indent amount, used when pretty printing.
+     * @return the current indent amount
+     */
+    public func indentAmount() -> UInt {
+        return self._indentAmount
+    }
+
+    /**
+     * Set the indent amount for pretty printing.
+     * @param indentAmount number of spaces to use for indenting each level. The unsigned type rules out
+     *        negative values.
+     * @return this, for chaining
+     */
+    @discardableResult
+    public func indentAmount(indentAmount: UInt) -> OutputSettings {
+        self._indentAmount = indentAmount
+        return self
+    }
+
+    /**
+     * Create an independent copy of these output settings.
+     *
+     * Every setting is copied explicitly: the clone is a freshly constructed OutputSettings, so any value not
+     * assigned here would keep its default rather than the value held by this instance.
+     * @param zone unused
+     * @return a new OutputSettings carrying the same charset, escape mode, syntax, pretty-print, outline and
+     *         indent settings
+     */
+    public func copy(with zone: NSZone? = nil) -> Any {
+        let clone: OutputSettings = OutputSettings()
+        clone.charset(_encoder) // new charset and charset encoder
+        clone._escapeMode = _escapeMode
+        // Unlike Java's Object.clone(), nothing is copied implicitly, so carry the remaining settings over too.
+        clone._syntax = _syntax
+        clone._prettyPrint = _prettyPrint
+        clone._outline = _outline
+        clone._indentAmount = _indentAmount
+        return clone
+    }
+
+}
diff --git a/Swiftgram/SwiftSoup/Sources/DocumentType.swift b/Swiftgram/SwiftSoup/Sources/DocumentType.swift
new file mode 100644
index 00000000000..95f9b10df31
--- /dev/null
+++ b/Swiftgram/SwiftSoup/Sources/DocumentType.swift
@@ -0,0 +1,129 @@
+//
+// DocumentType.swift
+//  SwiftSoup
+//
+// Created by Nabil Chatbi on 29/09/16.
+// Copyright © 2016 Nabil Chatbi.. All rights reserved.
+//
+
+import Foundation
+
+/**
+ * A {@code <!DOCTYPE html>} node.
+ */
+public class DocumentType: Node {
+    /// Keyword stored under the pubSysKey attribute when a public identifier is present.
+    static let PUBLIC_KEY = "PUBLIC"
+    /// The other keyword the pubSysKey attribute may hold.
+    static let SYSTEM_KEY = "SYSTEM"
+    // Attribute keys under which the parts of the doctype are stored on this node.
+    private static let NAME = "name"
+    private static let PUB_SYS_KEY = "pubSysKey" // PUBLIC or SYSTEM
+    private static let PUBLIC_ID = "publicId"
+    private static let SYSTEM_ID = "systemId"
+    // todo: quirk mode from publicId and systemId
+
+    /**
+     * Create a new doctype element.
+     *
+     * The pubSysKey attribute is set to PUBLIC when the stored public ID is not blank. The initializer does not
+     * throw: if storing an attribute fails, the error is swallowed and the remaining attributes are left unset.
+     * @param name the doctype's name
+     * @param publicId the doctype's public ID
+     * @param systemId the doctype's system ID
+     * @param baseUri the doctype's base URI
+     */
+    public init(_ name: String, _ publicId: String, _ systemId: String, _ baseUri: String) {
+        super.init(baseUri)
+        do {
+            try attr(DocumentType.NAME, name)
+            try attr(DocumentType.PUBLIC_ID, publicId)
+            let hasPublicId = has(DocumentType.PUBLIC_ID)
+            if hasPublicId {
+                try attr(DocumentType.PUB_SYS_KEY, DocumentType.PUBLIC_KEY)
+            }
+            try attr(DocumentType.SYSTEM_ID, systemId)
+        } catch {
+            // Best effort only: construction never fails.
+        }
+    }
+
+    /**
+     * Create a new doctype element with an explicit PUBLIC / SYSTEM keyword.
+     *
+     * The initializer does not throw: if storing an attribute fails, the error is swallowed and the remaining
+     * attributes are left unset.
+     * @param name the doctype's name
+     * @param pubSysKey the keyword to store under the pubSysKey attribute; skipped when nil
+     * @param publicId the doctype's public ID
+     * @param systemId the doctype's system ID
+     * @param baseUri the doctype's base URI
+     */
+    public init(_ name: String, _ pubSysKey: String?, _ publicId: String, _ systemId: String, _ baseUri: String) {
+        super.init(baseUri)
+        do {
+            try attr(DocumentType.NAME, name)
+            if let keyword = pubSysKey {
+                try attr(DocumentType.PUB_SYS_KEY, keyword)
+            }
+            try attr(DocumentType.PUBLIC_ID, publicId)
+            try attr(DocumentType.SYSTEM_ID, systemId)
+        } catch {
+            // Best effort only: construction never fails.
+        }
+    }
+
+    /// The node name of a doctype node: always the literal "#doctype".
+    public override func nodeName() -> String {
+        let name = "#doctype"
+        return name
+    }
+
+ override func outerHtmlHead(_ accum: StringBuilder, _ depth: Int, _ out: OutputSettings) {
+ if (out.syntax() == OutputSettings.Syntax.html && !has(DocumentType.PUBLIC_ID) && !has(DocumentType.SYSTEM_ID)) {
+ // looks like a html5 doctype, go lowercase for aesthetics
+ accum.append("")
+ }
+
+    /// Appends nothing: this node writes no output in its tail step.
+    override func outerHtmlTail(_ accum: StringBuilder, _ depth: Int, _ out: OutputSettings) {
+        // Intentionally empty.
+    }
+
+    /// Tells whether the named attribute holds a non-blank value; a failed lookup counts as absent.
+    private func has(_ attribute: String) -> Bool {
+        guard let value = try? attr(attribute) else {
+            return false
+        }
+        return !StringUtil.isBlank(value)
+    }
+
+    /// Builds a new DocumentType from this node's name, public ID, system ID and base URI, then hands it to
+    /// copy(clone:) to finish the copy.
+    public override func copy(with zone: NSZone? = nil) -> Any {
+        let attrs = attributes!
+        let name = attrs.get(key: DocumentType.NAME)
+        let publicId = attrs.get(key: DocumentType.PUBLIC_ID)
+        let systemId = attrs.get(key: DocumentType.SYSTEM_ID)
+        let clone = DocumentType(name, publicId, systemId, baseUri!)
+        return copy(clone: clone)
+    }
+
+    /// Builds a new DocumentType from this node's name, public ID, system ID and base URI, then hands it to
+    /// copy(clone:parent:) together with the given parent.
+    public override func copy(parent: Node?) -> Node {
+        let attrs = attributes!
+        let name = attrs.get(key: DocumentType.NAME)
+        let publicId = attrs.get(key: DocumentType.PUBLIC_ID)
+        let systemId = attrs.get(key: DocumentType.SYSTEM_ID)
+        let clone = DocumentType(name, publicId, systemId, baseUri!)
+        return copy(clone: clone, parent: parent)
+    }
+
+    /// Defers entirely to the superclass implementation.
+    public override func copy(clone: Node, parent: Node?) -> Node {
+        let copied = super.copy(clone: clone, parent: parent)
+        return copied
+    }
+
+}
diff --git a/Swiftgram/SwiftSoup/Sources/Element.swift b/Swiftgram/SwiftSoup/Sources/Element.swift
new file mode 100644
index 00000000000..630b9914bc2
--- /dev/null
+++ b/Swiftgram/SwiftSoup/Sources/Element.swift
@@ -0,0 +1,1316 @@
+//
+// Element.swift
+//  SwiftSoup
+//
+// Created by Nabil Chatbi on 29/09/16.
+// Copyright © 2016 Nabil Chatbi.. All rights reserved.
+//
+
+import Foundation
+
+open class Element: Node {
+    // The tag backing this element; nodeName(), tagName(), tagNameNormal() and isBlock() all read from it.
+    var _tag: Tag
+
+    // Shared string constants. idString is the attribute key looked up by id(); emptyString is its fallback.
+    private static let classString = "class"
+    private static let emptyString = ""
+    private static let idString = "id"
+    private static let rootString = "#root"
+
+    //private static let classSplit : Pattern = Pattern("\\s+")
+    // Pattern text matching a run of whitespace; presumably used to split class names — confirm against callers.
+    private static let classSplit = "\\s+"
+
+    /**
+     * Create a new, standalone Element. (Standalone in that it has no parent.)
+     *
+     * @param tag tag of this element
+     * @param baseUri the base URI
+     * @param attributes initial attributes
+     * @see #appendChild(Node)
+     * @see #appendElement(String)
+     */
+    public init(_ tag: Tag, _ baseUri: String, _ attributes: Attributes) {
+        _tag = tag
+        super.init(baseUri, attributes)
+    }
+    /**
+     * Create a new Element from a tag and a base URI, starting with an empty attribute set.
+     *
+     * @param tag element tag
+     * @param baseUri the base URI of this element. It is acceptable for the base URI to be an empty
+     *            string.
+     * @see Tag#valueOf(String, ParseSettings)
+     */
+    public init(_ tag: Tag, _ baseUri: String) {
+        _tag = tag
+        let noAttributes = Attributes()
+        super.init(baseUri, noAttributes)
+    }
+
+    /// The node name of an element is the name of its tag.
+    open override func nodeName() -> String {
+        let name = _tag.getName()
+        return name
+    }
+    /**
+     * Get the name of the tag for this element. E.g. {@code div}
+     *
+     * @return the tag name, as reported by the underlying tag
+     */
+    open func tagName() -> String {
+        let name = _tag.getName()
+        return name
+    }
+    /// Returns the underlying tag's getNameNormal() value.
+    open func tagNameNormal() -> String {
+        let normalName = _tag.getNameNormal()
+        return normalName
+    }
+
+ /**
+ * Change the tag of this element. For example, convert a {@code } to a {@code == false}).
+ *
+ * @return true if block, false if not (and thus inline)
+ */
+    open func isBlock() -> Bool {
+        // Block-ness is a property of the tag, not of the individual element.
+        let block = _tag.isBlock()
+        return block
+    }
+
+    /**
+     * Get the {@code id} attribute of this element. The attribute key is matched case-insensitively.
+     *
+     * @return The id attribute, if present, or an empty string if there are no attributes, no id, or the
+     *         lookup fails.
+     */
+    open func id() -> String {
+        guard let attrs = attributes else {
+            return Element.emptyString
+        }
+        return (try? attrs.getIgnoreCase(key: Element.idString)) ?? Element.emptyString
+    }
+
+    /**
+     * Set an attribute value on this element. If this element already has an attribute with the
+     * key, its value is updated; otherwise, a new attribute is added.
+     *
+     * @param attributeKey the attribute key
+     * @param attributeValue the attribute value
+     * @return this element
+     * @throws whatever the superclass implementation throws
+     */
+    @discardableResult
+    open override func attr(_ attributeKey: String, _ attributeValue: String) throws -> Element {
+        // The superclass does the work; the override only narrows the return type for chaining.
+        _ = try super.attr(attributeKey, attributeValue)
+        return self
+    }
+
+ /**
+ * Set a boolean attribute value on this element. Setting to
+ * E.g., the element {@code
+ * This map is a filtered view of the element's attribute map. Changes to one map (add, remove, update) are reflected
+ * in the other map.
+ *
+ * You can find elements that have data attributes using the {@code [^data-]} attribute key prefix selector.
+ * @return a map of {@code key=value} custom data attributes.
+ */
+ open func dataset()->Dictionary
+ * Note that an element can have both mixed Nodes and Elements as children. This method inspects
+ * a filtered list of children that are elements, and the index is based on that filtered list.
+ *
+ * This is effectively a filter on {@link #childNodes()} to get Element nodes.
+ *
+ * This is effectively a filter on {@link #childNodes()} to get Text nodes.
+ * @return child text nodes. If this element has no text nodes, returns an
+ * empty list.
+ * One Two Three
+ * This is effectively a filter on {@link #childNodes()} to get Data nodes.
+ *
+ * This method is generally more powerful to use than the DOM-type {@code getElementBy*} methods, because
+ * multiple filters can be combined, e.g.:
+ *
+ * See the query syntax documentation in {@link CssSelector}.
+ *
+ * If the element has an ID, returns #id;
+ * otherwise returns the parent (if any) CSS selector, followed by {@literal '>'},
+ * followed by a unique selector for the element (tag.class.class:nth-child(n)).
+ *
+ * This is similar to {@link #nextSibling()}, but specifically finds only Elements
+ *
+ * Note that this finds the first matching ID, starting with this element. If you search down from a different
+ * starting point, it is possible to find a different element by ID. For unique element by ID within a Document,
+ * use {@link Document#getElementById(String)}
+ * @param id The ID to search for.
+ * @return The first matching element by ID, starting with this element, or null if none found.
+ */
+    public func getElementById(_ id: String)throws->Element? {
+        // An empty ID is rejected before any search is attempted.
+        try Validate.notEmpty(string: id)
+
+        let matches: Elements = try Collector.collect(Evaluator.Id(id), self)
+        guard !matches.array().isEmpty else {
+            return nil
+        }
+        // Only the first collected match is returned.
+        return matches.get(0)
+    }
+
+ /**
+ * Find elements that have this class, including or under this element. Case insensitive.
+ *
+ * Elements can have multiple classes (e.g. {@code
+ * For example, given HTML {@code Hello there now!
+ * For example, given HTML {@code Hello there now! }, would return
+ * {@code
+To get an {@code Elements} object, use the {@link Element#select(String)} method.
+
+ * Note that it is possible to get repeats if the matched elements contain both parent elements and their own
+ * children, as the Element.text() method returns the combined text of a parent and all its children.
+ * @return string of all text: unescaped and no HTML.
+ * @see Element#text()
+ */
+    open func text(trimAndNormaliseWhitespace: Bool = true)throws->String {
+        let joined = StringBuilder()
+        for member in this {
+            let piece = try member.text(trimAndNormaliseWhitespace: trimAndNormaliseWhitespace)
+            // Separate texts with a single space, but never lead with one.
+            if !joined.isEmpty {
+                joined.append(" ")
+            }
+            joined.append(piece)
+        }
+        return joined.toString()
+    }
+
+    /// Tells whether at least one element in this list has text.
+    open func hasText() -> Bool {
+        return this.contains { $0.hasText() }
+    }
+
+ /**
+ * Get the text content of each of the matched elements. If an element has no text, then it is not included in the
+ * result.
+ * @return A list of each matched element's text content.
+ * @see Element#text()
+ * @see Element#hasText()
+ * @see #text()
+ */
+ public func eachText()throws->Array This is SwiftSoup This is SwiftSoup
+ * This is useful for e.g removing unwanted formatting elements but keeping their contents.
+ * {@code {@code doc.select("font").unwrap();} HTML = {@code
+ * E.g. HTML: {@code Hello there now
+ * E.g. HTML: {@code Hello there
+ * Note that this method should not be used to clean user-submitted HTML; rather, use {@link Cleaner} to clean HTML.
+ * @return this, for chaining
+ * @see Element#empty()
+ * @see #empty()
+ */
+ @discardableResult
+    open func remove()throws->Elements {
+        // Remove every element in turn; the first failure aborts and propagates.
+        try this.forEach { try $0.remove() }
+        return self
+    }
+
+ // filters
+
+    /**
+     * Find matching elements within this element list.
+     * @param query A {@link CssSelector} query
+     * @return the filtered list of elements, or an empty list if none match.
+     * @throws if CssSelector.select throws for the given query
+     */
+    open func select(_ query: String)throws->Elements {
+        let matched: Elements = try CssSelector.select(query, this)
+        return matched
+    }
+
+ /**
+ * Remove elements from this list that match the {@link CssSelector} query.
+ *
+ * E.g. HTML: {@code
+ * @param query the selector query whose results should be removed from these elements
+ * @return a new elements list that contains only the filtered results
+ */
+    open func not(_ query: String)throws->Elements {
+        // First find what the query selects, then filter those elements out of this list.
+        let excluded: Elements = try CssSelector.select(query, this)
+        return CssSelector.filterOut(this, excluded.this)
+    }
+
+    /**
+     * Get the nth matched element as an Elements object.
+     *
+     * See also {@link #get(int)} to retrieve an Element.
+     * @param index the (zero-based) index of the element in the list to retain
+     * @return Elements containing only the specified element, or, if that element did not exist (the index is
+     *         negative or not less than size()), an empty list.
+     */
+    open func eq(_ index: Int) -> Elements {
+        // Check both bounds so get(_:) is never called with an index outside 0..<size().
+        guard index >= 0 && index < size() else {
+            return Elements()
+        }
+        return Elements([get(index)])
+    }
+
+    /**
+     * Test if any of the matched elements match the supplied query.
+     * @param query A selector
+     * @return true if at least one element in the list matches the query.
+     * @throws if the query cannot be parsed or an element's match test throws
+     */
+    open func iS(_ query: String)throws->Bool {
+        // Parse once, then stop at the first element that matches.
+        let evaluator: Evaluator = try QueryParser.parse(query)
+        return try this.contains { try $0.iS(evaluator) }
+    }
+
+ /**
+ * Get all of the parents and ancestor elements of the matched elements.
+ * @return all of the parents and ancestor elements of the matched elements
+ */
+
+ open func parents() -> Elements {
+ let combo: OrderedSet
+ * To get an absolute URL from an attribute that may be a relative URL, prefix the key with
+ * E.g.:
+ * If the attribute value is already absolute (i.e. it starts with a protocol, like
+ *
+ * As an alternate, you can use the {@link #attr} method with the is remainder
+ if (wrapChildren.count > 0) {
+ for i in 0.. {@code {@code
+ * The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}.
+ * @return stand-alone cloned node
+ */
+    public func copy(with zone: NSZone? = nil) -> Any {
+        // Start from a blank Node and let copy(clone:) fill it in.
+        let blank = Node()
+        return copy(clone: blank)
+    }
+
+    /// Copies this node into a fresh Node via copy(clone:parent:), passing the given parent along.
+    public func copy(parent: Node?) -> Node {
+        return copy(clone: Node(), parent: parent)
+    }
+
+ public func copy(clone: Node) -> Node {
+ let thisClone: Node = copy(clone: clone, parent: nil) // splits for orphan
+
+ // Queue up nodes that need their children cloned (BFS).
+ var nodesToProcess: Array
+ * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
+ * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
+ * create a start tag for a node, and tail to create the end tag.
+ *
+// * Use examples:
+// *
+// The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
+//
+// @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
+// @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
+// @return The parsed HTML.
+//
+// @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed
+// @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored
+// @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored
+// @throws java.net.SocketTimeoutException if the connection times out
+// @throws IOException if a connection or read error occurs
+//
+// @see #connect(String)
+// */
+// public static func parse(_ url: URL, _ timeoutMillis: Int)throws->Document {
+// Connection con = HttpConnection.connect(url);
+// con.timeout(timeoutMillis);
+// return con.get();
+// }
+
+    /**
+     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
+     tags and attributes.
+
+     @param bodyHtml  input untrusted HTML (body fragment)
+     @param baseUri   URL to resolve relative URLs against
+     @param whitelist white-list of permitted HTML elements
+     @return safe HTML (body fragment), or nil when the cleaned document has no body
+
+     @see Cleaner#clean(Document)
+     */
+    public func clean(_ bodyHtml: String, _ baseUri: String, _ whitelist: Whitelist)throws->String? {
+        let untrusted = try parseBodyFragment(bodyHtml, baseUri)
+        let sanitized = try Cleaner(whitelist).clean(untrusted)
+        return try sanitized.body()?.html()
+    }
+
+    /**
+     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
+     tags and attributes. Uses an empty base URI.
+
+     @param bodyHtml  input untrusted HTML (body fragment)
+     @param whitelist white-list of permitted HTML elements
+     @return safe HTML (body fragment)
+
+     @see Cleaner#clean(Document)
+     */
+    public func clean(_ bodyHtml: String, _ whitelist: Whitelist)throws->String? {
+        let noBaseUri = ""
+        return try SwiftSoup.clean(bodyHtml, noBaseUri, whitelist)
+    }
+
+    /**
+     * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of
+     * permitted tags and attributes. The given output settings are applied to the cleaned document before it
+     * is serialized.
+     *
+     * @param bodyHtml input untrusted HTML (body fragment)
+     * @param baseUri URL to resolve relative URLs against
+     * @param whitelist white-list of permitted HTML elements
+     * @param outputSettings document output settings; use to control pretty-printing and entity escape modes
+     * @return safe HTML (body fragment), or nil when the cleaned document has no body
+     * @see Cleaner#clean(Document)
+     */
+    public func clean(_ bodyHtml: String, _ baseUri: String, _ whitelist: Whitelist, _ outputSettings: OutputSettings)throws->String? {
+        let untrusted = try SwiftSoup.parseBodyFragment(bodyHtml, baseUri)
+        let sanitized = try Cleaner(whitelist).clean(untrusted)
+        sanitized.outputSettings(outputSettings)
+        return try sanitized.body()?.html()
+    }
+
+    /**
+     Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. The input HTML should
+     still be run through the cleaner to set up enforced attributes, and to tidy the output.
+     The fragment is parsed with an empty base URI.
+     @param bodyHtml HTML to test
+     @param whitelist whitelist to test against
+     @return true if no tags or attributes were removed; false otherwise
+     @see #clean(String, Whitelist)
+     */
+    public func isValid(_ bodyHtml: String, _ whitelist: Whitelist)throws->Bool {
+        let fragment = try parseBodyFragment(bodyHtml, "")
+        return try Cleaner(whitelist).isValid(fragment)
+    }
diff --git a/Swiftgram/SwiftSoup/Sources/Tag.swift b/Swiftgram/SwiftSoup/Sources/Tag.swift
new file mode 100644
index 00000000000..ecdd27d84a8
--- /dev/null
+++ b/Swiftgram/SwiftSoup/Sources/Tag.swift
@@ -0,0 +1,347 @@
+//
+// Tag.swift
+// SwiftSoup
+//
+// Created by Nabil Chatbi on 15/10/16.
+// Copyright © 2016 Nabil Chatbi.. All rights reserved.
+//
+
+import Foundation
+
+open class Tag: Hashable {
+ // map of known tags
+ static var tags: Dictionary
+ * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals().
+ * )
+ tag = Tag(tagName)
+ tag!._isBlock = false
+ tag!._canContainBlock = true
+ }
+ }
+ return tag!
+ }
+
+ /**
+ * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
+ *
+ * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals().
+ *
+ Case insensitive.
+ * @param seq sequence to remove from head of queue.
+ */
+ open func consume(_ seq: String) throws {
+     // The head of the queue must match the sequence before anything is consumed.
+     guard matches(seq) else {
+         throw Exception.Error(type: ExceptionType.IllegalArgumentException, Message: "Queue did not match expected sequence")
+     }
+     let needed = seq.count
+     guard needed <= remainingLength() else {
+         throw Exception.Error(type: ExceptionType.IllegalArgumentException, Message: "Queue not long enough to consume sequence")
+     }
+
+     pos += needed
+ }
+
+ /**
+ * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
+ * @param seq String to end on (and not include in return, but leave on queue). Case sensitive.
+ * @return The matched data consumed from queue.
+ */
+ @discardableResult
+ open func consumeTo(_ seq: String) -> String {
+     // Pulls text off the queue up to (but excluding) `seq`; `seq` itself stays on the queue.
+     let offset = queue.indexOf(seq, pos)
+     guard offset != -1 else {
+         // The terminator is not present. The documented contract is to consume "to the queue
+         // running out", so hand back everything that is left instead of returning an empty
+         // string and leaving the position untouched.
+         return remainder()
+     }
+     let consumed = queue.substring(pos, offset-pos)
+     pos += consumed.count
+     return consumed
+ }
+
+ open func consumeToIgnoreCase(_ seq: String) -> String {
+     // Consumes up to (not including) `seq`, or to the end of the queue when it is not found.
+     let begin = pos
+     let first = seq.substring(0, 1)
+     // When the first character has no case distinction we can jump ahead with indexOf.
+     let canScan = first.lowercased() == first.uppercased()
+
+     while !isEmpty() && !matches(seq) {
+         guard canScan else {
+             pos += 1
+             continue
+         }
+         let skip = queue.indexOf(first, pos) - pos
+         switch skip {
+         case 0:
+             // Sitting on the candidate character but the full sequence did not match: step past it.
+             pos += 1
+         case _ where skip < 0:
+             // The first character never occurs again: take everything to the end.
+             pos = queue.count
+         default:
+             pos += skip
+         }
+     }
+
+     return queue.substring(begin, pos-begin)
+ }
+
+ /**
+ Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
+ @param seq any number of terminators to consume to. Case insensitive.
+ @return consumed string
+ */
+ // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this
+ // is a case sensitive time...
+ open func consumeToAny(_ seq: String...) -> String {
+     // Variadic convenience: forward the collected terminators to the array overload.
+     let terminators: [String] = seq
+     return consumeToAny(terminators)
+ }
+ open func consumeToAny(_ seq: [String]) -> String {
+     // Advance until the queue is exhausted or the head matches one of the terminators.
+     let begin = pos
+     while true {
+         if isEmpty() || matchesAny(seq) {
+             break
+         }
+         pos += 1
+     }
+
+     return queue.substring(begin, pos-begin)
+ }
+ /**
+ * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it).
+ *
+ * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go
+ * isEmpty() == true).
+ * @param seq String to match up to, and not include in return, and to pull off queue. Case sensitive.
+ * @return Data matched from queue.
+ */
+ open func chompTo(_ seq: String) -> String {
+     // Runs after the return value has been computed: drop the terminator itself from the queue.
+     defer { matchChomp(seq) }
+     return consumeTo(seq)
+ }
+
+ open func chompToIgnoreCase(_ seq: String) -> String {
+     // Runs after the scan below has produced the return value: drop the terminator from the queue.
+     defer { matchChomp(seq) }
+     return consumeToIgnoreCase(seq) // case insensitive scan
+ }
+
+ /**
+ * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
+ * and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). Those escapes will be left
+ * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for
+ * contains text strings; use unescape for that.
+ * @param open opener
+ * @param close closer
+ * @return data matched from the queue
+ */
+ open func chompBalanced(_ open: Character, _ close: Character) -> String {
+ // start/end delimit the returned slice of `queue`; both stay -1 until set below.
+ var start = -1
+ var end = -1
+ // depth counts openers seen minus closers seen (outside quotes and escapes).
+ var depth = 0
+ // last is the previously processed character; TokenQueue.empty means "none yet".
+ var last: Character = TokenQueue.empty
+ var inQuote = false
+
+ repeat {
+ if (isEmpty()) {break}
+ let c = consume()
+ // Only interpret c when the previous character was not the escape character.
+ if (last == TokenQueue.empty || last != TokenQueue.ESC) {
+ // A ' or " toggles quote mode, unless that quote character is itself the opener.
+ if ((c=="'" || c=="\"") && c != open) {
+ inQuote = !inQuote
+ }
+ if (inQuote) {
+ // `continue` jumps straight to the `while (depth > 0)` check, so neither `end`
+ // nor `last` is updated for this character.
+ continue
+ }
+ if (c==open) {
+ depth+=1
+ if (start == -1) {
+ // Record the position right after the first opener was consumed.
+ start = pos
+ }
+ } else if (c==close) {
+ depth-=1
+ }
+ }
+
+ // While still inside the balanced region, keep extending the end of the slice.
+ if (depth > 0 && last != TokenQueue.empty) {
+ end = pos // don't include the outer match pair in the return
+ }
+ last = c
+ } while (depth > 0)
+ // `end` is only set once at least one character inside the region was processed;
+ // otherwise an empty string is returned.
+ return (end >= 0) ? queue.substring(start, end-start) : ""
+ }
+
+ /**
+ * Unescapes a \ escaped string.
+ * @param in backslash escaped string
+ * @return unescaped string
+ */
+ public static func unescape(_ input: String) -> String {
+     let out = StringBuilder()
+     var previous = empty
+     for current in input {
+         // Every character is kept except an escape character that is not itself
+         // directly preceded by an escape character.
+         let isEscape = current == ESC
+         let previousWasEscape = previous != empty && previous == TokenQueue.ESC
+         if !isEscape || previousWasEscape {
+             out.append(current)
+         }
+         previous = current
+     }
+     return out.toString()
+ }
+
+ /**
+ * Pulls the next run of whitespace characters off the queue.
+ * @return Whether consuming whitespace or not
+ */
+ @discardableResult
+ open func consumeWhitespace() -> Bool {
+     // Skip over the whitespace run; whitespace was seen exactly when the position moved.
+     let begin = pos
+     while matchesWhitespace() {
+         pos += 1
+     }
+     return pos != begin
+ }
+
+ /**
+ * Retrieves the next run of word type (letter or digit) off the queue.
+ * @return String of word characters from queue, or empty string if none.
+ */
+ @discardableResult
+ open func consumeWord() -> String {
+     // Count how many consecutive word characters sit at the head of the queue.
+     let begin = pos
+     var length = 0
+     while matchesWord() {
+         pos += 1
+         length += 1
+     }
+     return queue.substring(begin, length)
+ }
+
+ /**
+ * Consume a tag name off the queue (word or :, _, -)
+ *
+ * @return tag name
+ */
+ open func consumeTagName() -> String {
+     // Advance while the head is a word character or one of ":", "_", "-".
+     let begin = pos
+     while true {
+         if isEmpty() { break }
+         guard matchesWord() || matchesAny(":", "_", "-") else { break }
+         pos += 1
+     }
+
+     return queue.substring(begin, pos-begin)
+ }
+
+ /**
+ * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
+ *
+ * @return tag name
+ */
+ open func consumeElementSelector() -> String {
+     // Advance one position at a time while the head is a word character or matches
+     // one of "*|", "|", "_", "-".
+     let begin = pos
+     while true {
+         if isEmpty() { break }
+         guard matchesWord() || matchesAny("*|", "|", "_", "-") else { break }
+         pos += 1
+     }
+
+     return queue.substring(begin, pos-begin)
+ }
+
+ /**
+ Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
+ http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
+ @return identifier
+ */
+ open func consumeCssIdentifier() -> String {
+     // Advance while the head is a word character, "-" or "_".
+     let begin = pos
+     while true {
+         if isEmpty() { break }
+         guard matchesWord() || matchesAny("-", "_") else { break }
+         pos += 1
+     }
+
+     return queue.substring(begin, pos-begin)
+ }
+
+ /**
+ Consume an attribute key off the queue (letter, digit, -, _, :)
+ @return attribute key
+ */
+ open func consumeAttributeKey() -> String {
+     // Advance while the head is a word character, "-", "_" or ":".
+     let begin = pos
+     while true {
+         if isEmpty() { break }
+         guard matchesWord() || matchesAny("-", "_", ":") else { break }
+         pos += 1
+     }
+
+     return queue.substring(begin, pos-begin)
+ }
+
+ /**
+ Consume and return whatever is left on the queue.
+ @return remainder of queue.
+ */
+ open func remainder() -> String {
+     // Take everything from the current position to the end, then mark the queue as fully consumed.
+     let end = queue.count
+     let rest = queue.substring(pos, end-pos)
+     pos = end
+     return rest
+ }
+
+ open func toString() -> String {
+     // The unread part of the queue, starting at the current position; the position is not moved.
+     let unread = queue.substring(pos)
+     return unread
+ }
+}
diff --git a/Swiftgram/SwiftSoup/Sources/Tokeniser.swift b/Swiftgram/SwiftSoup/Sources/Tokeniser.swift
new file mode 100644
index 00000000000..2fb5b59080c
--- /dev/null
+++ b/Swiftgram/SwiftSoup/Sources/Tokeniser.swift
@@ -0,0 +1,303 @@
+//
+// Tokeniser.swift
+// SwiftSoup
+//
+// Created by Nabil Chatbi on 19/10/16.
+// Copyright © 2016 Nabil Chatbi.. All rights reserved.
+//
+
+import Foundation
+
+ /// Reads tokens from a `CharacterReader` by running the current `TokeniserState` until a token
+ /// has been emitted, buffering character data so that a run of characters comes out as one token.
+ final class Tokeniser {
+     static let replacementChar: UnicodeScalar = "\u{FFFD}" // replaces null character
+     /// Characters that can never start a character reference (kept sorted for `matchesAnySorted`).
+     private static let notCharRefCharsSorted: [UnicodeScalar] = [
+         UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "<", UnicodeScalar.Ampersand
+     ].sorted()
+
+     private let reader: CharacterReader // html input
+     private let errors: ParseErrorList? // errors found while tokenising
+
+     private var state: TokeniserState = .Data // current tokenisation state
+     private var emitPending: Token? // the token we are about to emit on next read
+     private var isEmitPending: Bool = false
+     private var charsString: String? // characters pending an emit. Will fall to charsBuilder if more than one
+     private let charsBuilder: StringBuilder = StringBuilder(1024) // buffers characters to output as one token, if more than one emit per read
+     let dataBuffer: StringBuilder = StringBuilder(1024) // temp data buffer; cleared by createTempBuffer()
+
+     var tagPending: Token.Tag = Token.Tag() // tag we are building up
+     let startPending: Token.StartTag = Token.StartTag()
+     let endPending: Token.EndTag = Token.EndTag()
+     let charPending: Token.Char = Token.Char()
+     let doctypePending: Token.Doctype = Token.Doctype() // doctype building up
+     let commentPending: Token.Comment = Token.Comment() // comment building up
+     private var lastStartTag: String? // the last start tag emitted, to test appropriate end tag
+     private var selfClosingFlagAcknowledged: Bool = true
+
+     init(_ reader: CharacterReader, _ errors: ParseErrorList?) {
+         self.reader = reader
+         self.errors = errors
+     }
+
+     /// Runs the state machine until something is pending, then returns the next token.
+     /// Buffered character data is returned first; the pending non-character token is left
+     /// for the following call.
+     func read() throws -> Token {
+         if !selfClosingFlagAcknowledged {
+             error("Self closing flag not acknowledged")
+             selfClosingFlagAcknowledged = true
+         }
+
+         while !isEmitPending {
+             try state.read(self, reader)
+         }
+
+         // More than one string was emitted since the last read: flush the builder.
+         if !charsBuilder.isEmpty {
+             let buffered: String = charsBuilder.toString()
+             charsBuilder.clear()
+             charsString = nil
+             return charPending.data(buffered)
+         }
+
+         // Exactly one string was emitted since the last read.
+         if let single = charsString {
+             let token: Token = charPending.data(single)
+             charsString = nil
+             return token
+         }
+
+         // No characters buffered: hand out the pending token itself.
+         isEmitPending = false
+         return emitPending!
+     }
+
+     /// Marks `token` as the pending token. Throws if a token is already pending.
+     func emit(_ token: Token) throws {
+         try Validate.isFalse(val: isEmitPending, msg: "There is an unread token pending!")
+
+         emitPending = token
+         isEmitPending = true
+
+         switch token.type {
+         case Token.TokenType.StartTag:
+             let startTag: Token.StartTag = token as! Token.StartTag
+             lastStartTag = startTag._tagName!
+             if startTag._selfClosing {
+                 selfClosingFlagAcknowledged = false
+             }
+         case Token.TokenType.EndTag:
+             let endTag: Token.EndTag = token as! Token.EndTag
+             if endTag._attributes.size() != 0 {
+                 error("Attributes incorrectly present on end tag")
+             }
+         default:
+             break
+         }
+     }
+
+     func emit(_ str: String) {
+         // buffer strings up until last string token found, to emit only one token for a run of character refs etc.
+         // does not set isEmitPending; read checks that
+         guard let first = charsString else {
+             charsString = str
+             return
+         }
+         if charsBuilder.isEmpty { // switching to string builder as more than one emit before read
+             charsBuilder.append(first)
+         }
+         charsBuilder.append(str)
+     }
+
+     func emit(_ chars: [UnicodeScalar]) {
+         let characters = chars.map { Character($0) }
+         emit(String(characters))
+     }
+
+     // func emit(_ codepoints: [Int]) {
+     // emit(String(codepoints, 0, codepoints.length));
+     // }
+
+     func emit(_ c: UnicodeScalar) {
+         let text = String(c)
+         emit(text)
+     }
+
+     func getState() -> TokeniserState {
+         return state
+     }
+
+     func transition(_ state: TokeniserState) {
+         self.state = state
+     }
+
+     /// Advances the reader, then switches to `state`.
+     func advanceTransition(_ state: TokeniserState) {
+         reader.advance()
+         self.state = state
+     }
+
+     func acknowledgeSelfClosingFlag() {
+         selfClosingFlagAcknowledged = true
+     }
+
+     /// Tries to consume a character reference at the reader's current position.
+     /// Returns nil when there is no reference here (the reader is rewound to the mark where
+     /// needed), otherwise the scalar(s) it stands for.
+     func consumeCharacterReference(_ additionalAllowedCharacter: UnicodeScalar?, _ inAttribute: Bool) throws -> [UnicodeScalar]? {
+         if reader.isEmpty() {
+             return nil
+         }
+         if let allowed = additionalAllowedCharacter, allowed == reader.current() {
+             return nil
+         }
+         if reader.matchesAnySorted(Tokeniser.notCharRefCharsSorted) {
+             return nil
+         }
+
+         reader.markPos()
+         if reader.matchConsume("#") { // numbered
+             return consumeNumericReference()
+         } else { // named
+             return try consumeNamedReference(inAttribute)
+         }
+     }
+
+     /// Handles the part of a numeric reference after the "#" (decimal, or hex after "x"/"X").
+     private func consumeNumericReference() -> [UnicodeScalar]? {
+         let isHexMode: Bool = reader.matchConsumeIgnoreCase("X")
+         let numRef: String = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence()
+         if numRef.unicodeScalars.isEmpty { // didn't match anything
+             characterReferenceError("numeric reference with no numerals")
+             reader.rewindToMark()
+             return nil
+         }
+         if !reader.matchConsume(";") {
+             characterReferenceError("missing semicolon") // missing semi
+         }
+
+         // -1 stands for "could not be parsed as an Int in this radix".
+         let charval: Int = Int(numRef, radix: isHexMode ? 16 : 10) ?? -1
+         let isSurrogate = charval >= 0xD800 && charval <= 0xDFFF
+
+         if charval == -1 || isSurrogate || charval > 0x10FFFF {
+             characterReferenceError("character outside of valid range")
+             return [Tokeniser.replacementChar]
+         }
+         // todo: implement number replacement table
+         // todo: check for extra illegal unicode points as parse errors
+         return [UnicodeScalar(charval)!]
+     }
+
+     /// Handles a named reference: consumes the name and looks it up in `Entities`.
+     private func consumeNamedReference(_ inAttribute: Bool) throws -> [UnicodeScalar]? {
+         // get as many letters as possible, and look for matching entities.
+         let nameRef: String = reader.consumeLetterThenDigitSequence()
+         let looksLegit: Bool = reader.matches(";")
+         // found if a base named entity without a ;, or an extended entity with the ;.
+         let found: Bool = Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)
+
+         guard found else {
+             reader.rewindToMark()
+             if looksLegit { // named with semicolon
+                 characterReferenceError("invalid named referenece '\(nameRef)'")
+             }
+             return nil
+         }
+         if inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny("=", "-", "_")) {
+             // don't want that to match
+             reader.rewindToMark()
+             return nil
+         }
+         if !reader.matchConsume(";") {
+             characterReferenceError("missing semicolon") // missing semi
+         }
+         guard let points = Entities.codepointsForName(nameRef) else {
+             try Validate.fail(msg: "Entity name not found: \(nameRef)")
+             return []
+         }
+         if points.count > 2 {
+             try Validate.fail(msg: "Unexpected characters returned for \(nameRef) num: \(points.count)")
+         }
+         return points
+     }
+
+     /// Resets the reusable start or end tag token and makes it the pending tag.
+     @discardableResult
+     func createTagPending(_ start: Bool) -> Token.Tag {
+         if start {
+             tagPending = startPending.reset()
+         } else {
+             tagPending = endPending.reset()
+         }
+         return tagPending
+     }
+
+     func emitTagPending() throws {
+         try tagPending.finaliseTag()
+         try emit(tagPending)
+     }
+
+     func createCommentPending() {
+         commentPending.reset()
+     }
+
+     func emitCommentPending() throws {
+         try emit(commentPending)
+     }
+
+     func createDoctypePending() {
+         doctypePending.reset()
+     }
+
+     func emitDoctypePending() throws {
+         try emit(doctypePending)
+     }
+
+     func createTempBuffer() {
+         Token.reset(dataBuffer)
+     }
+
+     /// True when the pending tag's name equals, ignoring case, the last emitted start tag's name.
+     func isAppropriateEndTagToken() throws -> Bool {
+         guard let expected = lastStartTag else {
+             return false
+         }
+         let pendingName = try tagPending.name()
+         return pendingName.equalsIgnoreCase(string: expected)
+     }
+
+     /// The name of the last emitted start tag, or nil if none has been emitted.
+     func appropriateEndTagName() -> String? {
+         return lastStartTag
+     }
+
+     func error(_ state: TokeniserState) {
+         guard let errors = errors, errors.canAddError() else { return }
+         errors.add(ParseError(reader.getPos(), "Unexpected character '\(String(reader.current()))' in input state [\(state.description)]"))
+     }
+
+     func eofError(_ state: TokeniserState) {
+         guard let errors = errors, errors.canAddError() else { return }
+         errors.add(ParseError(reader.getPos(), "Unexpectedly reached end of file (EOF) in input state [\(state.description)]"))
+     }
+
+     private func characterReferenceError(_ message: String) {
+         guard let errors = errors, errors.canAddError() else { return }
+         errors.add(ParseError(reader.getPos(), "Invalid character reference: \(message)"))
+     }
+
+     private func error(_ errorMsg: String) {
+         guard let errors = errors, errors.canAddError() else { return }
+         errors.add(ParseError(reader.getPos(), errorMsg))
+     }
+
+     func currentNodeInHtmlNS() -> Bool {
+         // todo: implement namespaces correctly
+         return true
+         // Element currentNode = currentNode()
+         // return currentNode != null && currentNode.namespace().equals("HTML")
+     }
+
+     /**
+      * Utility method to consume reader and unescape entities found within.
+      * @param inAttribute passed through to consumeCharacterReference
+      * @return unescaped string from reader
+      */
+     func unescapeEntities(_ inAttribute: Bool) throws -> String {
+         let builder: StringBuilder = StringBuilder()
+         while !reader.isEmpty() {
+             builder.append(reader.consumeTo(UnicodeScalar.Ampersand))
+             guard reader.matches(UnicodeScalar.Ampersand) else { continue }
+             reader.consume()
+             // No reference (nil) or an empty result: keep the ampersand literally.
+             guard let scalars = try consumeCharacterReference(nil, inAttribute), scalars.count != 0 else {
+                 builder.append(UnicodeScalar.Ampersand)
+                 continue
+             }
+             builder.appendCodePoint(scalars[0])
+             if scalars.count == 2 {
+                 builder.appendCodePoint(scalars[1])
+             }
+         }
+         return builder.toString()
+     }
+
+ }
diff --git a/Swiftgram/SwiftSoup/Sources/TokeniserState.swift b/Swiftgram/SwiftSoup/Sources/TokeniserState.swift
new file mode 100644
index 00000000000..707248a83bc
--- /dev/null
+++ b/Swiftgram/SwiftSoup/Sources/TokeniserState.swift
@@ -0,0 +1,1644 @@
+//
+// TokeniserState.swift
+// SwiftSoup
+//
+// Created by Nabil Chatbi on 12/10/16.
+// Copyright © 2016 Nabil Chatbi.. All rights reserved.
+//
+
+import Foundation
+
+ /// Contract for a tokeniser state: `read` is handed the tokeniser and the character reader, and may throw.
+ protocol TokeniserStateProtocol {
+ func read(_ t: Tokeniser, _ r: CharacterReader)throws
+ }
+
+ /// Shared constants for the tokeniser states: sorted character sets plus a few sentinel scalars.
+ public class TokeniserStateVars {
+     /// The NUL scalar (U+0000).
+     public static let nullScalr: UnicodeScalar = "\u{0000}"
+
+     // Character sets, each sorted once at initialisation.
+     static let attributeSingleValueCharsSorted: [UnicodeScalar] = [
+         "'", UnicodeScalar.Ampersand, nullScalr
+     ].sorted()
+     static let attributeDoubleValueCharsSorted: [UnicodeScalar] = [
+         "\"", UnicodeScalar.Ampersand, nullScalr
+     ].sorted()
+     static let attributeNameCharsSorted: [UnicodeScalar] = [
+         UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "/", "=", ">", nullScalr, "\"", "'", UnicodeScalar.LessThan
+     ].sorted()
+     static let attributeValueUnquoted: [UnicodeScalar] = [
+         UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", UnicodeScalar.Ampersand, ">", nullScalr, "\"", "'", UnicodeScalar.LessThan, "=", "`"
+     ].sorted()
+
+     // Aliases for values owned by Tokeniser and CharacterReader.
+     static let replacementChar: UnicodeScalar = Tokeniser.replacementChar
+     static let replacementStr: String = String(Tokeniser.replacementChar)
+     static let eof: UnicodeScalar = CharacterReader.EOF
+ }
+
+enum TokeniserState: TokeniserStateProtocol {
+ case Data
+ case CharacterReferenceInData
+ case Rcdata
+ case CharacterReferenceInRcdata
+ case Rawtext
+ case ScriptData
+ case PLAINTEXT
+ case TagOpen
+ case EndTagOpen
+ case TagName
+ case RcdataLessthanSign
+ case RCDATAEndTagOpen
+ case RCDATAEndTagName
+ case RawtextLessthanSign
+ case RawtextEndTagOpen
+ case RawtextEndTagName
+ case ScriptDataLessthanSign
+ case ScriptDataEndTagOpen
+ case ScriptDataEndTagName
+ case ScriptDataEscapeStart
+ case ScriptDataEscapeStartDash
+ case ScriptDataEscaped
+ case ScriptDataEscapedDash
+ case ScriptDataEscapedDashDash
+ case ScriptDataEscapedLessthanSign
+ case ScriptDataEscapedEndTagOpen
+ case ScriptDataEscapedEndTagName
+ case ScriptDataDoubleEscapeStart
+ case ScriptDataDoubleEscaped
+ case ScriptDataDoubleEscapedDash
+ case ScriptDataDoubleEscapedDashDash
+ case ScriptDataDoubleEscapedLessthanSign
+ case ScriptDataDoubleEscapeEnd
+ case BeforeAttributeName
+ case AttributeName
+ case AfterAttributeName
+ case BeforeAttributeValue
+ case AttributeValue_doubleQuoted
+ case AttributeValue_singleQuoted
+ case AttributeValue_unquoted
+ case AfterAttributeValue_quoted
+ case SelfClosingStartTag
+ case BogusComment
+ case MarkupDeclarationOpen
+ case CommentStart
+ case CommentStartDash
+ case Comment
+ case CommentEndDash
+ case CommentEnd
+ case CommentEndBang
+ case Doctype
+ case BeforeDoctypeName
+ case DoctypeName
+ case AfterDoctypeName
+ case AfterDoctypePublicKeyword
+ case BeforeDoctypePublicIdentifier
+ case DoctypePublicIdentifier_doubleQuoted
+ case DoctypePublicIdentifier_singleQuoted
+ case AfterDoctypePublicIdentifier
+ case BetweenDoctypePublicAndSystemIdentifiers
+ case AfterDoctypeSystemKeyword
+ case BeforeDoctypeSystemIdentifier
+ case DoctypeSystemIdentifier_doubleQuoted
+ case DoctypeSystemIdentifier_singleQuoted
+ case AfterDoctypeSystemIdentifier
+ case BogusDoctype
+ case CdataSection
+
+ internal func read(_ t: Tokeniser, _ r: CharacterReader)throws {
+ switch self {
+ case .Data:
+ switch (r.current()) {
+ case UnicodeScalar.Ampersand:
+ t.advanceTransition(.CharacterReferenceInData)
+ break
+ case UnicodeScalar.LessThan:
+ t.advanceTransition(.TagOpen)
+ break
+ case TokeniserStateVars.nullScalr:
+ t.error(self) // NOT replacement character (oddly?)
+ t.emit(r.consume())
+ break
+ case TokeniserStateVars.eof:
+ try t.emit(Token.EOF())
+ break
+ default:
+ let data: String = r.consumeData()
+ t.emit(data)
+ break
+ }
+ break
+ case .CharacterReferenceInData:
+ try TokeniserState.readCharRef(t, .Data)
+ break
+ case .Rcdata:
+ switch (r.current()) {
+ case UnicodeScalar.Ampersand:
+ t.advanceTransition(.CharacterReferenceInRcdata)
+ break
+ case UnicodeScalar.LessThan:
+ t.advanceTransition(.RcdataLessthanSign)
+ break
+ case TokeniserStateVars.nullScalr:
+ t.error(self)
+ r.advance()
+ t.emit(TokeniserStateVars.replacementChar)
+ break
+ case TokeniserStateVars.eof:
+ try t.emit(Token.EOF())
+ break
+ default:
+ let data = r.consumeToAny(UnicodeScalar.Ampersand, UnicodeScalar.LessThan, TokeniserStateVars.nullScalr)
+ t.emit(data)
+ break
+ }
+ break
+ case .CharacterReferenceInRcdata:
+ try TokeniserState.readCharRef(t, .Rcdata)
+ break
+ case .Rawtext:
+ try TokeniserState.readData(t, r, self, .RawtextLessthanSign)
+ break
+ case .ScriptData:
+ try TokeniserState.readData(t, r, self, .ScriptDataLessthanSign)
+ break
+ case .PLAINTEXT:
+ switch (r.current()) {
+ case TokeniserStateVars.nullScalr:
+ t.error(self)
+ r.advance()
+ t.emit(TokeniserStateVars.replacementChar)
+ break
+ case TokeniserStateVars.eof:
+ try t.emit(Token.EOF())
+ break
+ default:
+ let data = r.consumeTo(TokeniserStateVars.nullScalr)
+ t.emit(data)
+ break
+ }
+ break
+ case .TagOpen:
+ // from < in data
+ switch (r.current()) {
+ case "!":
+ t.advanceTransition(.MarkupDeclarationOpen)
+ break
+ case "/":
+ t.advanceTransition(.EndTagOpen)
+ break
+ case "?":
+ t.advanceTransition(.BogusComment)
+ break
+ default:
+ if (r.matchesLetter()) {
+ t.createTagPending(true)
+ t.transition(.TagName)
+ } else {
+ t.error(self)
+ t.emit(UnicodeScalar.LessThan) // char that got us here
+ t.transition(.Data)
+ }
+ break
+ }
+ break
+ case .EndTagOpen:
+ if (r.isEmpty()) {
+ t.eofError(self)
+ t.emit("")
+ t.transition(.Data)
+ } else if (r.matchesLetter()) {
+ t.createTagPending(false)
+ t.transition(.TagName)
+ } else if (r.matches(">")) {
+ t.error(self)
+ t.advanceTransition(.Data)
+ } else {
+ t.error(self)
+ t.advanceTransition(.BogusComment)
+ }
+ break
+ case .TagName:
+ // from < or in data, will have start or end tag pending
+ // previous TagOpen state did NOT consume, will have a letter char in current
+ //String tagName = r.consumeToAnySorted(tagCharsSorted).toLowerCase()
+ let tagName = r.consumeTagName()
+ t.tagPending.appendTagName(tagName)
+
+ switch (r.consume()) {
+ case UnicodeScalar.BackslashT:
+ t.transition(.BeforeAttributeName)
+ break
+ case "\n":
+ t.transition(.BeforeAttributeName)
+ break
+ case "\r":
+ t.transition(.BeforeAttributeName)
+ break
+ case UnicodeScalar.BackslashF:
+ t.transition(.BeforeAttributeName)
+ break
+ case " ":
+ t.transition(.BeforeAttributeName)
+ break
+ case "/":
+ t.transition(.SelfClosingStartTag)
+ break
+ case ">":
+ try t.emitTagPending()
+ t.transition(.Data)
+ break
+ case TokeniserStateVars.nullScalr: // replacement
+ t.tagPending.appendTagName(TokeniserStateVars.replacementStr)
+ break
+ case TokeniserStateVars.eof: // should emit pending tag?
+ t.eofError(self)
+ t.transition(.Data)
+ // no default, as covered with above consumeToAny
+ default:
+ break
+ }
+ case .RcdataLessthanSign:
+ if (r.matches("/")) {
+ t.createTempBuffer()
+ t.advanceTransition(.RCDATAEndTagOpen)
+ } else if (r.matchesLetter() && t.appropriateEndTagName() != nil && !r.containsIgnoreCase("" + t.appropriateEndTagName()!)) {
+ // diverge from spec: got a start tag, but there's no appropriate end tag (), so rather than
+ // consuming to EOF break out here
+ t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName()!)
+ try t.emitTagPending()
+ r.unconsume() // undo UnicodeScalar.LessThan
+ t.transition(.Data)
+ } else {
+ t.emit(UnicodeScalar.LessThan)
+ t.transition(.Rcdata)
+ }
+ break
+ case .RCDATAEndTagOpen:
+ if (r.matchesLetter()) {
+ t.createTagPending(false)
+ t.tagPending.appendTagName(r.current())
+ t.dataBuffer.append(r.current())
+ t.advanceTransition(.RCDATAEndTagName)
+ } else {
+ t.emit("")
+ t.transition(.Rcdata)
+ }
+ break
+ case .RCDATAEndTagName:
+ if (r.matchesLetter()) {
+ let name = r.consumeLetterSequence()
+ t.tagPending.appendTagName(name)
+ t.dataBuffer.append(name)
+ return
+ }
+
+ func anythingElse(_ t: Tokeniser, _ r: CharacterReader) {
+ t.emit("" + t.dataBuffer.toString())
+ r.unconsume()
+ t.transition(.Rcdata)
+ }
+
+ let c = r.consume()
+ switch (c) {
+ case UnicodeScalar.BackslashT:
+ if (try t.isAppropriateEndTagToken()) {
+ t.transition(.BeforeAttributeName)
+ } else {
+ anythingElse(t, r)
+ }
+ break
+ case "\n":
+ if (try t.isAppropriateEndTagToken()) {
+ t.transition(.BeforeAttributeName)
+ } else {
+ anythingElse(t, r)
+ }
+ break
+ case "\r":
+ if (try t.isAppropriateEndTagToken()) {
+ t.transition(.BeforeAttributeName)
+ } else {
+ anythingElse(t, r)
+ }
+ break
+ case UnicodeScalar.BackslashF:
+ if (try t.isAppropriateEndTagToken()) {
+ t.transition(.BeforeAttributeName)
+ } else {
+ anythingElse(t, r)
+ }
+ break
+ case " ":
+ if (try t.isAppropriateEndTagToken()) {
+ t.transition(.BeforeAttributeName)
+ } else {
+ anythingElse(t, r)
+ }
+ break
+ case "/":
+ if (try t.isAppropriateEndTagToken()) {
+ t.transition(.SelfClosingStartTag)
+ } else {
+ anythingElse(t, r)
+ }
+ break
+ case ">":
+ if (try t.isAppropriateEndTagToken()) {
+ try t.emitTagPending()
+ t.transition(.Data)
+ } else {anythingElse(t, r)}
+ break
+ default:
+ anythingElse(t, r)
+ break
+ }
+ break
+ case .RawtextLessthanSign:
+ if (r.matches("/")) {
+ t.createTempBuffer()
+ t.advanceTransition(.RawtextEndTagOpen)
+ } else {
+ t.emit(UnicodeScalar.LessThan)
+ t.transition(.Rawtext)
+ }
+ break
+ case .RawtextEndTagOpen:
+ TokeniserState.readEndTag(t, r, .RawtextEndTagName, .Rawtext)
+ break
+ case .RawtextEndTagName:
+ try TokeniserState.handleDataEndTag(t, r, .Rawtext)
+ break
+ case .ScriptDataLessthanSign:
+ switch (r.consume()) {
+ case "/":
+ t.createTempBuffer()
+ t.transition(.ScriptDataEndTagOpen)
+ break
+ case "!":
+ t.emit("":
+ t.emit(c)
+ t.transition(.ScriptData)
+ break
+ case TokeniserStateVars.nullScalr:
+ t.error(self)
+ t.emit(TokeniserStateVars.replacementChar)
+ t.transition(.ScriptDataEscaped)
+ break
+ default:
+ t.emit(c)
+ t.transition(.ScriptDataEscaped)
+ }
+ break
+ case .ScriptDataEscapedLessthanSign:
+ if (r.matchesLetter()) {
+ t.createTempBuffer()
+ t.dataBuffer.append(r.current())
+ t.emit("<" + String(r.current()))
+ t.advanceTransition(.ScriptDataDoubleEscapeStart)
+ } else if (r.matches("/")) {
+ t.createTempBuffer()
+ t.advanceTransition(.ScriptDataEscapedEndTagOpen)
+ } else {
+ t.emit(UnicodeScalar.LessThan)
+ t.transition(.ScriptDataEscaped)
+ }
+ break
+ case .ScriptDataEscapedEndTagOpen:
+ if (r.matchesLetter()) {
+ t.createTagPending(false)
+ t.tagPending.appendTagName(r.current())
+ t.dataBuffer.append(r.current())
+ t.advanceTransition(.ScriptDataEscapedEndTagName)
+ } else {
+ t.emit("")
+ t.transition(.ScriptDataEscaped)
+ }
+ break
+ case .ScriptDataEscapedEndTagName:
+ try TokeniserState.handleDataEndTag(t, r, .ScriptDataEscaped)
+ break
+ case .ScriptDataDoubleEscapeStart:
+ TokeniserState.handleDataDoubleEscapeTag(t, r, .ScriptDataDoubleEscaped, .ScriptDataEscaped)
+ break
+ case .ScriptDataDoubleEscaped:
+ let c = r.current()
+ switch (c) {
+ case "-":
+ t.emit(c)
+ t.advanceTransition(.ScriptDataDoubleEscapedDash)
+ break
+ case UnicodeScalar.LessThan:
+ t.emit(c)
+ t.advanceTransition(.ScriptDataDoubleEscapedLessthanSign)
+ break
+ case TokeniserStateVars.nullScalr:
+ t.error(self)
+ r.advance()
+ t.emit(TokeniserStateVars.replacementChar)
+ break
+ case TokeniserStateVars.eof:
+ t.eofError(self)
+ t.transition(.Data)
+ break
+ default:
+ let data = r.consumeToAny("-", UnicodeScalar.LessThan, TokeniserStateVars.nullScalr)
+ t.emit(data)
+ }
+ break
+ case .ScriptDataDoubleEscapedDash:
+ let c = r.consume()
+ switch (c) {
+ case "-":
+ t.emit(c)
+ t.transition(.ScriptDataDoubleEscapedDashDash)
+ break
+ case UnicodeScalar.LessThan:
+ t.emit(c)
+ t.transition(.ScriptDataDoubleEscapedLessthanSign)
+ break
+ case TokeniserStateVars.nullScalr:
+ t.error(self)
+ t.emit(TokeniserStateVars.replacementChar)
+ t.transition(.ScriptDataDoubleEscaped)
+ break
+ case TokeniserStateVars.eof:
+ t.eofError(self)
+ t.transition(.Data)
+ break
+ default:
+ t.emit(c)
+ t.transition(.ScriptDataDoubleEscaped)
+ }
+ break
+ case .ScriptDataDoubleEscapedDashDash:
+ let c = r.consume()
+ switch (c) {
+ case "-":
+ t.emit(c)
+ break
+ case UnicodeScalar.LessThan:
+ t.emit(c)
+ t.transition(.ScriptDataDoubleEscapedLessthanSign)
+ break
+ case ">":
+ t.emit(c)
+ t.transition(.ScriptData)
+ break
+ case TokeniserStateVars.nullScalr:
+ t.error(self)
+ t.emit(TokeniserStateVars.replacementChar)
+ t.transition(.ScriptDataDoubleEscaped)
+ break
+ case TokeniserStateVars.eof:
+ t.eofError(self)
+ t.transition(.Data)
+ break
+ default:
+ t.emit(c)
+ t.transition(.ScriptDataDoubleEscaped)
+ }
+ break
+ case .ScriptDataDoubleEscapedLessthanSign:
+ if (r.matches("/")) {
+ t.emit("/")
+ t.createTempBuffer()
+ t.advanceTransition(.ScriptDataDoubleEscapeEnd)
+ } else {
+ t.transition(.ScriptDataDoubleEscaped)
+ }
+ break
+ case .ScriptDataDoubleEscapeEnd:
+ TokeniserState.handleDataDoubleEscapeTag(t, r, .ScriptDataEscaped, .ScriptDataDoubleEscaped)
+ break
+ case .BeforeAttributeName:
+ // from tagname
+ Start with one of the defaults:
+
+ If you need to allow more through (please be careful!), tweak a base whitelist with:
+
+ You can remove any setting from an existing whitelist with:
+
+ The cleaner and these whitelists assume that you want to clean a
+ If you are going to extend a whitelist, please be very careful. Make sure you understand what attributes may lead to
+ XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See
+ http://ha.ckers.org/xss.html for some XSS attack examples.
+
+ This whitelist allows a fuller range of text nodes:
+ Links (
+ Does not allow images.
+
+ Links do not have an enforced
+ E.g.:
+ To make an attribute valid for all tags, use the pseudo tag
+ E.g.:
+ To make an attribute invalid for all tags, use the pseudo tag
+ E.g.:
+ * Note that when handling relative links, the input document must have an appropriate {@code base URI} set when
+ * parsing, so that the link's protocol can be confirmed. Regardless of the setting of the {@code preserve relative
+ * links} option, the link must be resolvable against the base URI to an allowed protocol; otherwise the attribute
+ * will be removed.
+ *
+ E.g.:
+ To allow a link to an in-page URL anchor (i.e.
+ E.g.: Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser())}true
sets the attribute value to "" and
+ * marks the attribute as boolean so no value is written out. Setting to `false` removes the attribute
+ * with the same key if it exists.
+ *
+ * @param attributeKey the attribute key
+ * @param attributeValue the attribute value
+ *
+ * @return this element
+ */
+ @discardableResult
+ open func attr(_ attributeKey: String, _ attributeValue: Bool) throws -> Element {
+     // Nothing is stored when this element has no attribute collection.
+     if let attrs = attributes {
+         try attrs.put(attributeKey, attributeValue)
+     }
+     return self
+ }
+
+ /**
+ * Get this element's HTML5 custom data attributes. Each attribute in the element that has a key
+ * starting with "data-" is included in the dataset.
+ *
Four
+ *
+ */
+ open func textNodes()->Array
]}
, " Four"]}
+ *
+ * (E.g. on <code>&lt;div class="header gray"&gt;</code> returns, "<code>header gray</code>")
+ * @return The literal class attribute, or empty string if no class attribute set.
+ */
+ public func className()throws->String {
+     // Read the attribute named by `Element.classString`, then strip surrounding whitespace.
+     let literalValue: String = try attr(Element.classString)
+     return literalValue.trim()
+ }
+
+ /**
+ * Get all of the element's class names. E.g. on element {@code doc.select("b").wrap("<i></i>");
+ becomes {@code
+ * doc.select("p").empty();
+ * HTML = {@code
+ * doc.select("p").remove();
+ * HTML = {@code
+ * Elements divs = doc.select("div").not(".logo");
+ * Result: {@code divs: [null
if contents is empty.
+ */
+ open func first() -> Element? {
+     // An empty collection has no first element.
+     guard !isEmpty() else {
+         return nil
+     }
+     return get(0)
+ }
+
+ /// Reports whether this collection stores no elements.
+ ///
+ /// - Returns: `true` when `array()` contains nothing, `false` otherwise.
+ open func isEmpty() -> Bool {
+     // `isEmpty` states the intent directly instead of comparing `count` with zero.
+     return array().isEmpty
+ }
+
+ /// Returns the number of elements in `array()`.
+ open func size() -> Int {
+ return array().count
+ }
+
+ /**
+  Get the last matched element.
+  @return The last matched element, or <code>nil</code> if contents is empty.
+  */
+ open func last() -> Element? {
+     // Nothing to return for an empty collection.
+     guard !isEmpty() else {
+         return nil
+     }
+     // The last element sits at index size() - 1.
+     return get(size() - 1)
+ }
+
+ /**
+  * Perform a depth-first traversal on each of the selected elements, using a
+  * `NodeTraversor` built from the supplied visitor.
+  * @param nodeVisitor the visitor callbacks to perform on each node
+  * @return this, for chaining
+  * @throws any error raised by the traversor while walking an element
+  */
+ @discardableResult
+ open func traverse(_ nodeVisitor: NodeVisitor)throws->Elements {
+     let walker = NodeTraversor(nodeVisitor)
+     // Walk each stored element in order; the first thrown error aborts the walk.
+     try this.forEach { element in
+         try walker.traverse(element)
+     }
+     return self
+ }
+
+ /**
+ * Get the {@link FormElement} forms from the selected elements, if any.
+ * @return a list of {@link FormElement}s pulled from the matched elements. The list will be empty if the elements contain
+ * no forms.
+ */
+ open func forms()->Array and processing fake end tag
+ return false
+ }
+ tb.generateImpliedEndTags()
+ if (!name.equals(tb.currentElement()?.nodeName())) {
+ tb.error(self)
+ }
+ tb.popStackToClose(name)
+ tb.clearFormattingElementsToLastMarker()
+ tb.transition(.InRow)
+ } else if let name = name, TagSets.tableMix7.contains(name) {
+ tb.error(self)
+ return false
+ } else if let name = name, TagSets.table.contains(name) {
+ if (try !tb.inTableScope(name)) {
+ tb.error(self)
+ return false
+ }
+ try closeCell(tb)
+ return try tb.process(t)
+ } else {
+ return try anythingElse(t, tb)
+ }
+ } else if let nName = t.startTagNormalName(), TagSets.tableRowsAndCols.contains(nName) {
+ if (try !(tb.inTableScope("td") || tb.inTableScope("th"))) {
+ tb.error(self)
+ return false
+ }
+ try closeCell(tb)
+ return try tb.process(t)
+ } else {
+ return try anythingElse(t, tb)
+ }
+ return true
+ case .InSelect:
+
+ func anythingElse(_ t: Token, _ tb: HtmlTreeBuilder) -> Bool {
+ tb.error(self)
+ return false
+ }
+
+ switch (t.type) {
+ case .Char:
+ let c: Token.Char = t.asCharacter()
+ if (HtmlTreeBuilderState.nullString.equals(c.getData())) {
+ tb.error(self)
+ return false
+ } else {
+ try tb.insert(c)
+ }
+ break
+ case .Comment:
+ try tb.insert(t.asComment())
+ break
+ case .Doctype:
+ tb.error(self)
+ return false
+ case .StartTag:
+ let start: Token.StartTag = t.asStartTag()
+ let name: String? = start.normalName()
+ if ("html".equals(name)) {
+ return try tb.process(start, .InBody)
+ } else if ("option".equals(name)) {
+ try tb.processEndTag("option")
+ try tb.insert(start)
+ } else if ("optgroup".equals(name)) {
+ if ("option".equals(tb.currentElement()?.nodeName())) {
+ try tb.processEndTag("option")
+ } else if ("optgroup".equals(tb.currentElement()?.nodeName())) {
+ try tb.processEndTag("optgroup")
+ }
+ try tb.insert(start)
+ } else if ("select".equals(name)) {
+ tb.error(self)
+ return try tb.processEndTag("select")
+ } else if let name = name, TagSets.inputKeygenTextarea.contains(name) {
+ tb.error(self)
+ if (try !tb.inSelectScope("select")) {
+ return false // frag
+ }
+ try tb.processEndTag("select")
+ return try tb.process(start)
+ } else if ("script".equals(name)) {
+ return try tb.process(t, .InHead)
+ } else {
+ return anythingElse(t, tb)
+ }
+ break
+ case .EndTag:
+ let end: Token.EndTag = t.asEndTag()
+ let name = end.normalName()
+ if ("optgroup".equals(name)) {
+ if ("option".equals(tb.currentElement()?.nodeName()) && tb.currentElement() != nil && tb.aboveOnStack(tb.currentElement()!) != nil && "optgroup".equals(tb.aboveOnStack(tb.currentElement()!)?.nodeName())) {
+ try tb.processEndTag("option")
+ }
+ if ("optgroup".equals(tb.currentElement()?.nodeName())) {
+ tb.pop()
+ } else {
+ tb.error(self)
+ }
+ } else if ("option".equals(name)) {
+ if ("option".equals(tb.currentElement()?.nodeName())) {
+ tb.pop()
+ } else {
+ tb.error(self)
+ }
+ } else if ("select".equals(name)) {
+ if (try !tb.inSelectScope(name!)) {
+ tb.error(self)
+ return false
+ } else {
+ tb.popStackToClose(name!)
+ tb.resetInsertionMode()
+ }
+ } else {
+ return anythingElse(t, tb)
+ }
+ break
+ case .EOF:
+ if (!"html".equals(tb.currentElement()?.nodeName())) {
+ tb.error(self)
+ }
+ break
+// default:
+// return anythingElse(t, tb)
+ }
+ return true
+ case .InSelectInTable:
+ if let nName = t.startTagNormalName(), TagSets.tableMix8.contains(nName) {
+ tb.error(self)
+ try tb.processEndTag("select")
+ return try tb.process(t)
+ } else if let nName = t.endTagNormalName(), TagSets.tableMix8.contains(nName) {
+ tb.error(self)
+ if try tb.inTableScope(nName) {
+ try tb.processEndTag("select")
+ return try (tb.process(t))
+ } else {
+ return false
+ }
+ } else {
+ return try tb.process(t, .InSelect)
+ }
+ case .AfterBody:
+ if (HtmlTreeBuilderState.isWhitespace(t)) {
+ return try tb.process(t, .InBody)
+ } else if (t.isComment()) {
+ try tb.insert(t.asComment()) // into html node
+ } else if (t.isDoctype()) {
+ tb.error(self)
+ return false
+ } else if t.startTagNormalName() == "html" {
+ return try tb.process(t, .InBody)
+ } else if t.endTagNormalName() == "html" {
+ if (tb.isFragmentParsing()) {
+ tb.error(self)
+ return false
+ } else {
+ tb.transition(.AfterAfterBody)
+ }
+ } else if (t.isEOF()) {
+ // chillax! we're done
+ } else {
+ tb.error(self)
+ tb.transition(.InBody)
+ return try tb.process(t)
+ }
+ return true
+ case .InFrameset:
+
+ if (HtmlTreeBuilderState.isWhitespace(t)) {
+ try tb.insert(t.asCharacter())
+ } else if (t.isComment()) {
+ try tb.insert(t.asComment())
+ } else if (t.isDoctype()) {
+ tb.error(self)
+ return false
+ } else if (t.isStartTag()) {
+ let start: Token.StartTag = t.asStartTag()
+ let name: String? = start.normalName()
+ if ("html".equals(name)) {
+ return try tb.process(start, .InBody)
+ } else if ("frameset".equals(name)) {
+ try tb.insert(start)
+ } else if ("frame".equals(name)) {
+ try tb.insertEmpty(start)
+ } else if ("noframes".equals(name)) {
+ return try tb.process(start, .InHead)
+ } else {
+ tb.error(self)
+ return false
+ }
+ } else if t.endTagNormalName() == "frameset" {
+ if ("html".equals(tb.currentElement()?.nodeName())) { // frag
+ tb.error(self)
+ return false
+ } else {
+ tb.pop()
+ if (!tb.isFragmentParsing() && !"frameset".equals(tb.currentElement()?.nodeName())) {
+ tb.transition(.AfterFrameset)
+ }
+ }
+ } else if (t.isEOF()) {
+ if (!"html".equals(tb.currentElement()?.nodeName())) {
+ tb.error(self)
+ return true
+ }
+ } else {
+ tb.error(self)
+ return false
+ }
+ return true
+ case .AfterFrameset:
+
+ if (HtmlTreeBuilderState.isWhitespace(t)) {
+ try tb.insert(t.asCharacter())
+ } else if (t.isComment()) {
+ try tb.insert(t.asComment())
+ } else if (t.isDoctype()) {
+ tb.error(self)
+ return false
+ } else if t.startTagNormalName() == "html" {
+ return try tb.process(t, .InBody)
+ } else if t.endTagNormalName() == "html" {
+ tb.transition(.AfterAfterFrameset)
+ } else if t.startTagNormalName() == "noframes" {
+ return try tb.process(t, .InHead)
+ } else if (t.isEOF()) {
+ // cool your heels, we're complete
+ } else {
+ tb.error(self)
+ return false
+ }
+ return true
+ case .AfterAfterBody:
+
+ if (t.isComment()) {
+ try tb.insert(t.asComment())
+ } else if (t.isDoctype() || HtmlTreeBuilderState.isWhitespace(t) || (t.isStartTag() && "html".equals(t.asStartTag().normalName()))) {
+ return try tb.process(t, .InBody)
+ } else if (t.isEOF()) {
+ // nice work chuck
+ } else {
+ tb.error(self)
+ tb.transition(.InBody)
+ return try tb.process(t)
+ }
+ return true
+ case .AfterAfterFrameset:
+
+ if (t.isComment()) {
+ try tb.insert(t.asComment())
+ } else if (t.isDoctype() || HtmlTreeBuilderState.isWhitespace(t) || (t.startTagNormalName() == "html")) {
+ return try tb.process(t, .InBody)
+ } else if (t.isEOF()) {
+ // nice work chuck
+ } else if t.startTagNormalName() == "noframes" {
+ return try tb.process(t, .InHead)
+ } else {
+ tb.error(self)
+ return false
+ }
+ return true
+ case .ForeignContent:
+ return true
+ // todo: implement. Also how do we get here?
+ }
+
+ }
+
+ /// Returns `true` when `t` is a character token whose data passes the
+ /// string-based `isWhitespace` check; any other token kind yields `false`.
+ private static func isWhitespace(_ t: Token) -> Bool {
+     guard t.isCharacter() else {
+         return false
+     }
+     let text: String? = t.asCharacter().getData()
+     return isWhitespace(text)
+ }
+
+ /// Returns `true` when every character of `data` satisfies
+ /// `StringUtil.isWhitespace`. A `nil` or empty string also yields `true`.
+ private static func isWhitespace(_ data: String?) -> Bool {
+     // todo: this checks more than spec - UnicodeScalar.BackslashT, "\n", "\f", "\r", " "
+     guard let text = data else {
+         return true
+     }
+     // Stops at the first non-whitespace character, like the equivalent early-return loop.
+     return text.allSatisfy { StringUtil.isWhitespace($0) }
+ }
+
+ private static func handleRcData(_ startTag: Token.StartTag, _ tb: HtmlTreeBuilder)throws { // inserts the element, then moves tokeniser and tree builder into their text-handling states
+ try tb.insert(startTag) // add the element for this start tag; insertion errors propagate to the caller
+ tb.tokeniser.transition(TokeniserState.Rcdata) // tokeniser switches to the Rcdata state
+ tb.markInsertionMode() // presumably remembers the current state so it can be restored later - confirm in HtmlTreeBuilder
+ tb.transition(.Text) // tree builder continues in the Text state
+ }
+
+ private static func handleRawtext(_ startTag: Token.StartTag, _ tb: HtmlTreeBuilder)throws { // same sequence as handleRcData, but the tokeniser goes to Rawtext
+ try tb.insert(startTag) // add the element for this start tag; insertion errors propagate to the caller
+ tb.tokeniser.transition(TokeniserState.Rawtext) // tokeniser switches to the Rawtext state
+ tb.markInsertionMode() // presumably remembers the current state so it can be restored later - confirm in HtmlTreeBuilder
+ tb.transition(.Text) // tree builder continues in the Text state
+ }
+
+ // Lists of tag names to search through. A little harder to read here, but kept as shared static constants
+ // rather than inline varargs, which (per the original note) was contributing around 10% of parse GC load.
+ fileprivate final class Constants { // namespace only: every member is a static [String] constant
+ fileprivate static let InBodyStartToHead: [String] = ["base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title"]
+ fileprivate static let InBodyStartPClosers: [String] = ["address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl",
+ "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol",
+ "p", "section", "summary", "ul"]
+ fileprivate static let Headings: [String] = ["h1", "h2", "h3", "h4", "h5", "h6"] // the six heading tag names
+ fileprivate static let InBodyStartPreListing: [String] = ["pre", "listing"]
+ fileprivate static let InBodyStartLiBreakers: [String] = ["address", "div", "p"]
+ fileprivate static let DdDt: [String] = ["dd", "dt"]
+ fileprivate static let Formatters: [String] = ["b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u"]
+ fileprivate static let InBodyStartApplets: [String] = ["applet", "marquee", "object"]
+ fileprivate static let InBodyStartEmptyFormatters: [String] = ["area", "br", "embed", "img", "keygen", "wbr"]
+ fileprivate static let InBodyStartMedia: [String] = ["param", "source", "track"]
+ fileprivate static let InBodyStartInputAttribs: [String] = ["name", "action", "prompt"] // NOTE(review): by name these look like attribute names, not tag names - confirm at the use site
+ fileprivate static let InBodyStartOptions: [String] = ["optgroup", "option"]
+ fileprivate static let InBodyStartRuby: [String] = ["rp", "rt"]
+ fileprivate static let InBodyStartDrop: [String] = ["caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr"]
+ fileprivate static let InBodyEndClosers: [String] = ["address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div",
+ "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu",
+ "nav", "ol", "pre", "section", "summary", "ul"]
+ fileprivate static let InBodyEndAdoptionFormatters: [String] = ["a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"]
+ fileprivate static let InBodyEndTableFosters: [String] = ["table", "tbody", "tfoot", "thead", "tr"]
+ }
+}
+
+fileprivate extension Token {
+
+ /// The normalised name of this token when it is an end tag; `nil` for every other token kind.
+ func endTagNormalName() -> String? {
+     return isEndTag() ? asEndTag().normalName() : nil
+ }
+
+ /// The normalised name of this token when it is a start tag; `nil` for every other token kind.
+ func startTagNormalName() -> String? {
+     return isStartTag() ? asStartTag().normalName() : nil
+ }
+
+}
diff --git a/Swiftgram/SwiftSoup/Sources/HttpStatusException.swift b/Swiftgram/SwiftSoup/Sources/HttpStatusException.swift
new file mode 100644
index 00000000000..7d52dcac246
--- /dev/null
+++ b/Swiftgram/SwiftSoup/Sources/HttpStatusException.swift
@@ -0,0 +1,10 @@
+//
+// HttpStatusException.swift
+// SwiftSoup
+//
+// Created by Nabil Chatbi on 29/09/16.
+// Copyright © 2016 Nabil Chatbi.. All rights reserved.
+//
+
+import Foundation
+//TODO:
diff --git a/Swiftgram/SwiftSoup/Sources/Info.plist b/Swiftgram/SwiftSoup/Sources/Info.plist
new file mode 100644
index 00000000000..bfe6ad8b1db
--- /dev/null
+++ b/Swiftgram/SwiftSoup/Sources/Info.plist
@@ -0,0 +1,26 @@
+
+
+ abs
,
+ * which is a shortcut to the {@link #absUrl} method.
+ *
+ *
+ * @param attributeKey The attribute key.
+ * @return The attribute, or empty string if not present (to avoid nulls).
+ * @see #attributes()
+ * @see #hasAttr(String)
+ * @see #absUrl(String)
+ */
+ open func attr(_ attributeKey: String)throws ->String {
+     // `attributes` is optional (other accessors use `attributes?` / `guard let`), so a node
+     // without attribute storage is treated as "attribute not set" instead of being force-unwrapped.
+     let value: String = try attributes?.getIgnoreCase(key: attributeKey) ?? Node.empty
+     if !value.isEmpty {
+         return value
+     }
+     // No stored value: a key carrying the `Node.abs` prefix (matched case-insensitively) is
+     // resolved through absUrl using the remainder of the key.
+     if attributeKey.lowercased().startsWith(Node.abs) {
+         return try absUrl(attributeKey.substring(Node.abs.count))
+     }
+     return Node.empty
+ }
+
+ /**
+ * Get all of the element's attributes.
+ * @return attributes (which implements iterable, in same order as presented in original HTML).
+ * The stored value is returned as-is and is optional, so the result is nil when this node
+ * has no attribute storage.
+ */
+ open func getAttributes() -> Attributes? {
+ return attributes
+ }
+
+ /**
+ * Set an attribute (key=value). If the attribute already exists, it is replaced.
+ * When this node has no attribute storage (`attributes` is nil) the call does nothing.
+ * @param attributeKey The attribute key.
+ * @param attributeValue The attribute value.
+ * @return this (for chaining)
+ * @throws any error raised by `Attributes.put` while building the attribute
+ */
+ @discardableResult
+ open func attr(_ attributeKey: String, _ attributeValue: String)throws->Node {
+ try attributes?.put(attributeKey, attributeValue)
+ return self
+ }
+
+ /**
+  * Test if this element has an attribute. Case insensitive.
+  *
+  * A key carrying the `Node.abs` prefix is also accepted: it matches when the attribute named by
+  * the remainder of the key exists and resolves to a non-empty absolute URL.
+  * @param attributeKey The attribute key to check.
+  * @return true if the attribute exists, false if not (also false when the node has no attribute
+  * storage, or when resolving the absolute URL throws).
+  */
+ open func hasAttr(_ attributeKey: String) -> Bool {
+     guard let attributes = attributes else {
+         return false
+     }
+     // Match the prefix case-insensitively, the same way attr(_:) does, so that the two
+     // accessors agree on which keys count as absolute-URL lookups.
+     if attributeKey.lowercased().startsWith(Node.abs) {
+         let key: String = attributeKey.substring(Node.abs.count)
+         do {
+             let abs = try absUrl(key)
+             if attributes.hasKeyIgnoreCase(key: key) && !Node.empty.equals(abs) {
+                 return true
+             }
+         } catch {
+             // Resolution failed: report the attribute as absent rather than propagating.
+             return false
+         }
+     }
+     // Fall back to a plain lookup of the key exactly as given (ignoring case).
+     return attributes.hasKeyIgnoreCase(key: attributeKey)
+ }
+
+ /**
+ * Remove an attribute from this element. The key is matched ignoring case
+ * (the call is forwarded to `removeIgnoreCase`). Does nothing when this node has no
+ * attribute storage (`attributes` is nil).
+ * @param attributeKey The attribute to remove.
+ * @return this (for chaining)
+ */
+ @discardableResult
+ open func removeAttr(_ attributeKey: String)throws->Node {
+ try attributes?.removeIgnoreCase(key: attributeKey)
+ return self
+ }
+
+ /**
+  Get the base URI of this node.
+  @return base URI, or an empty string when no base URI has been set
+  */
+ open func getBaseUri() -> String {
+     // `baseUri` is optional; fall back to "" instead of force-unwrapping so that a node
+     // whose base URI was never assigned cannot crash the caller.
+     return baseUri ?? ""
+ }
+
+ /**
+  Update the base URI of this node and all of the nodes reached by `traverse`.
+  @param baseUri base URI to set
+  @throws any error raised while traversing
+  */
+ open func setBaseUri(_ baseUri: String)throws {
+     // Visitor that stamps the new base URI onto each node when it is entered.
+     final class BaseUriAssigner: NodeVisitor {
+         private let newBaseUri: String
+
+         init(_ newBaseUri: String) {
+             self.newBaseUri = newBaseUri
+         }
+
+         func head(_ node: Node, _ depth: Int)throws {
+             node.baseUri = newBaseUri
+         }
+
+         func tail(_ node: Node, _ depth: Int)throws {
+             // Nothing to do when leaving a node.
+         }
+     }
+
+     let assigner = BaseUriAssigner(baseUri)
+     try traverse(assigner)
+ }
+
+ /**
+ * Get an absolute URL from a URL attribute that may be relative (i.e. an String url = a.attr("abs:href");
<a href>
or
+ * <img src>
).
+ * String absUrl = linkEl.absUrl("href");
+ * http://
or https://
etc), and it successfully parses as a URL, the attribute is
+ * returned directly. Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made
+ * absolute using that.
+ * abs:
prefix, e.g.:
+ * String absUrl = linkEl.attr("abs:href");
+ *
+// *
+// * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
+// * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
+// */
+// public static Connection connect(String url) {
+// return HttpConnection.connect(url);
+// }
+
+ //todo:
+// /**
+// Parse the contents of a file as HTML.
+//
+// @param in file to load HTML from
+// @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+// present, or fall back to {@code UTF-8} (which is often safe to do).
+// @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
+// @return sane HTML
+//
+// @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+// */
+// public static Document parse(File in, String charsetName, String baseUri) throws IOException {
+// return DataUtil.load(in, charsetName, baseUri);
+// }
+
+ //todo:
+// /**
+// Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
+//
+// @param in file to load HTML from
+// @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+// present, or fall back to {@code UTF-8} (which is often safe to do).
+// @return sane HTML
+//
+// @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+// @see #parse(File, String, String)
+// */
+// public static Document parse(File in, String charsetName) throws IOException {
+// return DataUtil.load(in, charsetName, in.getAbsolutePath());
+// }
+
+// /**
+// Read an input stream, and parse it to a Document.
+//
+// @param in input stream to read. Make sure to close it after parsing.
+// @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+// present, or fall back to {@code UTF-8} (which is often safe to do).
+// @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
+// @return sane HTML
+//
+// @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+// */
+// public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
+// return DataUtil.load(in, charsetName, baseUri);
+// }
+
+// /**
+// Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
+// (non-HTML) parser.
+//
+// @param in input stream to read. Make sure to close it after parsing.
+// @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+// present, or fall back to {@code UTF-8} (which is often safe to do).
+// @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
+// @param parser alternate {@link Parser#xmlParser() parser} to use.
+// @return sane HTML
+//
+// @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+// */
+// public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
+// return DataUtil.load(in, charsetName, baseUri, parser);
+// }
+
+ /**
+ Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
+ The work is delegated to `Parser.parseBodyFragment`; any error it throws is propagated.
+
+ @param bodyHtml body HTML fragment
+ @param baseUri URL to resolve relative URLs against.
+ @return sane HTML document
+
+ @see Document#body()
+ */
+ public func parseBodyFragment(_ bodyHtml: String, _ baseUri: String)throws->Document {
+ return try Parser.parseBodyFragment(bodyHtml, baseUri)
+ }
+
+ /**
+  Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
+  An empty string is used as the base URI.
+
+  @param bodyHtml body HTML fragment
+  @return sane HTML document
+
+  @see Document#body()
+  */
+ public func parseBodyFragment(_ bodyHtml: String)throws->Document {
+     // Same as the two-argument form with an empty base URI.
+     let emptyBaseUri = ""
+     return try parseBodyFragment(bodyHtml, emptyBaseUri)
+ }
+
+// /**
+// Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
+// Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();
Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();
+
+
+
+
+
+
+ body
fragment of HTML (to add user
+ supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, either wrap the
+ document HTML around the cleaned body HTML, or create a whitelist that allows html
and head
+ elements as appropriate.
+ b, em, i, strong, u
. All other HTML (tags and
+ attributes) will be removed.
+
+ @return whitelist
+ */
+ public static func simpleText()throws ->Whitelist { // builds a fresh whitelist permitting only the five tags added below
+ return try Whitelist().addTags("b", "em", "i", "strong", "u") // no attributes or protocols are added here
+ }
+
+ /**
+ a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li,
+ ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul
, and appropriate attributes.
+ a
elements) can point to http, https, ftp, mailto
, and have an enforced
+ rel=nofollow
attribute.
+ img
tags, with appropriate
+ attributes, with src
pointing to http
or https
.
+
+ @return whitelist
+ */
+ public static func basicWithImages()throws->Whitelist { // extends the whitelist returned by basic() with image support
+ return try basic() // start from whatever basic() allows
+ .addTags("img") // permit the img tag itself
+ .addAttributes("img", "align", "alt", "height", "src", "title", "width") // attributes kept on img
+ .addProtocols("img", "src", "http", "https") // img src is limited to these protocols
+
+ }
+
+ /**
+ This whitelist allows a full range of text and structural body HTML: a, b, blockquote, br, caption, cite,
+ code, col, colgroup, dd, div, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, span, strike, strong, sub,
+ sup, table, tbody, td, tfoot, th, thead, tr, u, ul
+ rel=nofollow
attribute, but you can add that if desired.
+ addAttributes("a", "href", "class")
allows href
and class
attributes
+ on a
tags.
+ :all
, e.g.
+ addAttributes(":all", "class")
.
+ removeAttributes("a", "href", "class")
disallows href
and class
+ attributes on a
tags.
+ :all
, e.g.
+ removeAttributes(":all", "class")
.
+ addEnforcedAttribute("a", "rel", "nofollow")
will make all a
tags output as
+ <a href="..." rel="nofollow">
+ addProtocols("a", "href", "ftp", "http", "https")
+ <a href="#anchor">
, add a #
:
+ E.g.: addProtocols("a", "href", "#")
+ removeProtocols("a", "href", "ftp")
+