From 26081edeea5ba2c40632cee3c2dc46d58d1bb70f Mon Sep 17 00:00:00 2001 From: Thilo Molitor Date: Mon, 14 Oct 2024 08:41:22 +0200 Subject: [PATCH] Implement swift-bridged html parser in rust --- Monal/Classes/MLOgHtmlParser.swift | 48 +- Monal/Classes/SwiftHelpers.swift | 42 +- Monal/Classes/chatViewController.m | 13 +- Monal/Monal.xcodeproj/project.pbxproj | 21 +- rust/Cargo.lock | 622 ++++++++++++++++++- rust/Cargo.toml | 1 + rust/build-rust.sh | 11 +- rust/monal-html-parser/Cargo.toml | 13 + rust/monal-html-parser/src/bin/parse_html.rs | 34 + rust/monal-html-parser/src/lib.rs | 38 ++ rust/monal-rust-swift-bridge/Cargo.toml | 1 + rust/monal-rust-swift-bridge/src/lib.rs | 15 + 12 files changed, 796 insertions(+), 63 deletions(-) create mode 100644 rust/monal-html-parser/Cargo.toml create mode 100644 rust/monal-html-parser/src/bin/parse_html.rs create mode 100644 rust/monal-html-parser/src/lib.rs diff --git a/Monal/Classes/MLOgHtmlParser.swift b/Monal/Classes/MLOgHtmlParser.swift index 828c72139b..dc61754578 100644 --- a/Monal/Classes/MLOgHtmlParser.swift +++ b/Monal/Classes/MLOgHtmlParser.swift @@ -6,40 +6,32 @@ // Copyright © 2022 Monal.im. All rights reserved. // -import SwiftSoup; - @objc class MLOgHtmlParser: NSObject { var og_title: String? var og_image_url: URL? @objc init(html: String, andBaseUrl baseUrl: URL?) 
{ super.init() - do { - let parsedSite: Document = try SwiftSoup.parse(html) - - self.og_title = try parsedSite.select("meta[property=og:title]").first()?.attr("content") - if self.og_title == nil { - self.og_title = try parsedSite.select("html head title").first()?.text() - } - if self.og_title == nil { - DDLogWarn("Could not find any site title") - } - - if let image_url = try parsedSite.select("meta[property=og:image]").first()?.attr("content").removingPercentEncoding { - self.og_image_url = self.parseUrl(image_url, baseUrl) - } else if let image_url = try parsedSite.select("html head link[rel=apple-touch-icon]").first()?.attr("href").removingPercentEncoding { - self.og_image_url = self.parseUrl(image_url, baseUrl) - } else if let image_url = try parsedSite.select("html head link[rel=icon]").first()?.attr("href").removingPercentEncoding { - self.og_image_url = self.parseUrl(image_url, baseUrl) - } else if let image_url = try parsedSite.select("html head link[rel=shortcut icon]").first()?.attr("href").removingPercentEncoding { - self.og_image_url = self.parseUrl(image_url, baseUrl) - } else { - DDLogWarn("Could not find any site image") - } - } catch Exception.Error(let type, let message) { - DDLogWarn("Could not parse html og elements: \(message) type: \(type)") - } catch { - DDLogWarn("Could not parse html og elements: unhandled exception") + let parsedSite = HtmlParserBridge(html:html) + + self.og_title = try? parsedSite.select("meta[property=og:title]", attribute:"content").first + if self.og_title == nil { + self.og_title = try? parsedSite.select("html head title").first + } + if self.og_title == nil { + DDLogWarn("Could not find any site title") + } + + if let image_url = try? parsedSite.select("meta[property=og:image]", attribute:"content").first?.removingPercentEncoding { + self.og_image_url = self.parseUrl(image_url, baseUrl) + } else if let image_url = try? 
parsedSite.select("html head link[rel=apple-touch-icon]", attribute:"href").first?.removingPercentEncoding { + self.og_image_url = self.parseUrl(image_url, baseUrl) + } else if let image_url = try? parsedSite.select("html head link[rel=icon]", attribute:"href").first?.removingPercentEncoding { + self.og_image_url = self.parseUrl(image_url, baseUrl) + } else if let image_url = try? parsedSite.select("html head link[rel=shortcut icon]", attribute:"href").first?.removingPercentEncoding { + self.og_image_url = self.parseUrl(image_url, baseUrl) + } else { + DDLogWarn("Could not find any site image in html") } } diff --git a/Monal/Classes/SwiftHelpers.swift b/Monal/Classes/SwiftHelpers.swift index 686c1b2bba..eb5bd0f617 100644 --- a/Monal/Classes/SwiftHelpers.swift +++ b/Monal/Classes/SwiftHelpers.swift @@ -434,6 +434,32 @@ public class SwiftHelpers: NSObject { } } +//TODO: remove this +extension UIImage { + public func thumbnail(size: CGSize) -> UIImage? { + UIGraphicsBeginImageContextWithOptions(size, false, 0.0) + defer { UIGraphicsEndImageContext() } + draw(in: CGRect(origin: .zero, size: size)) + return UIGraphicsGetImageFromCurrentImageContext() + } +} + +// ********************************************** +// **************** rust bridges **************** +// ********************************************** + +fileprivate extension RustVec { + func intoArray() -> [T] { + var array: [T] = [] + for _ in 0.. UIImage? { - UIGraphicsBeginImageContextWithOptions(size, false, 0.0) - defer { UIGraphicsEndImageContext() } - draw(in: CGRect(origin: .zero, size: size)) - return UIGraphicsGetImageFromCurrentImageContext() +@objcMembers +public class HtmlParserBridge : NSObject { + var document: MonalHtmlParser + + public init(html: String) { + self.document = MonalHtmlParser(html) + } + + public func select(_ selector: String, attribute: String? 
= nil) throws -> [String] { + return self.document.select(selector, attribute).intoArray().map { $0.toString() } } } diff --git a/Monal/Classes/chatViewController.m b/Monal/Classes/chatViewController.m index ee419da86c..3242d924ed 100644 --- a/Monal/Classes/chatViewController.m +++ b/Monal/Classes/chatViewController.m @@ -3010,7 +3010,7 @@ -(void) loadPreviewWithUrlForRow:(NSIndexPath *) indexPath withResultHandler:(mo return; } //limit to 512KB of html - if(contentLength.intValue > 65536) + if(contentLength.intValue > 524288) { DDLogWarn(@"Now loading preview HTML for %@ with byte range 0-512k...", row.url); [self downloadPreviewWithRow:indexPath usingByterange:YES andResultHandler:resultHandler]; @@ -3050,7 +3050,7 @@ -(void) downloadPreviewWithRow:(NSIndexPath*) indexPath usingByterange:(BOOL) us request.requiresDNSSECValidation = YES; [request setValue:@"facebookexternalhit/1.1" forHTTPHeaderField:@"User-Agent"]; //required on some sites for og tags e.g. youtube if(useByterange) - [request setValue:@"bytes=0-65536" forHTTPHeaderField:@"Range"]; + [request setValue:@"bytes=0-524288" forHTTPHeaderField:@"Range"]; request.timeoutInterval = 10; NSURLSession* session = [HelperTools createEphemeralURLSession]; [[session dataTaskWithRequest:request completionHandler:^(NSData* _Nullable data, NSURLResponse* _Nullable response, NSError* _Nullable error) { @@ -3062,11 +3062,10 @@ -(void) downloadPreviewWithRow:(NSIndexPath*) indexPath usingByterange:(BOOL) us MLOgHtmlParser* ogParser = nil; NSString* text = nil; NSURL* image = nil; - if([body length] <= 65536) - { - NSURL* baseURL = [NSURL URLWithString:[NSString stringWithFormat:@"%@://%@%@", row.url.scheme, row.url.host, row.url.path]]; - ogParser = [[MLOgHtmlParser alloc] initWithHtml:body andBaseUrl:baseURL]; - } + if([body length] > 524288) + body = [body substringToIndex:524288]; + NSURL* baseURL = [NSURL URLWithString:[NSString stringWithFormat:@"%@://%@%@", row.url.scheme, row.url.host, row.url.path]]; + ogParser 
= [[MLOgHtmlParser alloc] initWithHtml:body andBaseUrl:baseURL]; if(ogParser != nil) { text = [ogParser getOgTitle]; diff --git a/Monal/Monal.xcodeproj/project.pbxproj b/Monal/Monal.xcodeproj/project.pbxproj index 216afa7fca..995ec1b6e2 100644 --- a/Monal/Monal.xcodeproj/project.pbxproj +++ b/Monal/Monal.xcodeproj/project.pbxproj @@ -208,7 +208,6 @@ C1C839DE24F15DF800BBCF17 /* MLOMEMO.m in Sources */ = {isa = PBXBuildFile; fileRef = C1C839DC24F15DF800BBCF17 /* MLOMEMO.m */; }; C1D7D7AF283FB4E500401389 /* Images.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 26B2A4BA1B73061400272E63 /* Images.xcassets */; }; C1D7D7B0283FB4E700401389 /* Media.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 26470F511835C4080069E3E0 /* Media.xcassets */; }; - C1E1EC7B286A025F0097EC74 /* SwiftSoup in Frameworks */ = {isa = PBXBuildFile; productRef = C1E1EC7A286A025F0097EC74 /* SwiftSoup */; }; C1E4654824EE517000CA5AAF /* Localizable.strings in Resources */ = {isa = PBXBuildFile; fileRef = C1E4654624EE517000CA5AAF /* Localizable.strings */; }; C1E8A7F72B8E47C300760220 /* EditGroupSubject.swift in Sources */ = {isa = PBXBuildFile; fileRef = C1E8A7F62B8E47C300760220 /* EditGroupSubject.swift */; }; C1F5C7A92775DA000001F295 /* MLContactSoftwareVersionInfo.h in Headers */ = {isa = PBXBuildFile; fileRef = C1F5C7A72775DA000001F295 /* MLContactSoftwareVersionInfo.h */; }; @@ -788,7 +787,6 @@ files = ( 8418B5672C87E0ED006FAF60 /* ExyteChat in Frameworks */, 261E542523A0A1D300394F59 /* monalxmpp.framework in Frameworks */, - C1E1EC7B286A025F0097EC74 /* SwiftSoup in Frameworks */, 84F194D12C15197200F0A994 /* FrameUp in Frameworks */, C176F1EC2AF11C31002034E5 /* UserNotifications.framework in Frameworks */, C1F5C7AF2777638B0001F295 /* OrderedCollections in Frameworks */, @@ -1547,7 +1545,6 @@ name = Monal; packageProductDependencies = ( C1F5C7AE2777638B0001F295 /* OrderedCollections */, - C1E1EC7A286A025F0097EC74 /* SwiftSoup */, 841898A92957712000FEC77D /* ViewExtractor */, 
84F194D02C15197200F0A994 /* FrameUp */, 8418B5662C87E0ED006FAF60 /* ExyteChat */, @@ -1763,7 +1760,6 @@ mainGroup = 29B97314FDCFA39411CA2CEA /* CustomTemplate */; packageReferences = ( C1F5C7AD2777638B0001F295 /* XCRemoteSwiftPackageReference "swift-collections" */, - C1E1EC79286A025F0097EC74 /* XCRemoteSwiftPackageReference "SwiftSoup" */, 841898A82957712000FEC77D /* XCRemoteSwiftPackageReference "ViewExtractor" */, 849ADF3D2BACF0360009BCD7 /* XCRemoteSwiftPackageReference "cocoalumberjack" */, 84F194CF2C15197200F0A994 /* XCRemoteSwiftPackageReference "FrameUp" */, @@ -4688,16 +4684,8 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/ryanlintott/FrameUp"; requirement = { - kind = upToNextMajorVersion; - minimumVersion = 0.8.0; - }; - }; - C1E1EC79286A025F0097EC74 /* XCRemoteSwiftPackageReference "SwiftSoup" */ = { - isa = XCRemoteSwiftPackageReference; - repositoryURL = "https://github.com/scinfu/SwiftSoup.git"; - requirement = { - kind = upToNextMajorVersion; - minimumVersion = 2.7.5; + kind = exactVersion; + version = 0.8.0; }; }; C1F5C7AD2777638B0001F295 /* XCRemoteSwiftPackageReference "swift-collections" */ = { @@ -4750,11 +4738,6 @@ package = 84F194CF2C15197200F0A994 /* XCRemoteSwiftPackageReference "FrameUp" */; productName = FrameUp; }; - C1E1EC7A286A025F0097EC74 /* SwiftSoup */ = { - isa = XCSwiftPackageProductDependency; - package = C1E1EC79286A025F0097EC74 /* XCRemoteSwiftPackageReference "SwiftSoup" */; - productName = SwiftSoup; - }; C1F5C7AE2777638B0001F295 /* OrderedCollections */ = { isa = XCSwiftPackageProductDependency; package = C1F5C7AD2777638B0001F295 /* XCRemoteSwiftPackageReference "swift-collections" */; diff --git a/rust/Cargo.lock b/rust/Cargo.lock index e37417927b..2926d1cbac 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1,6 +1,74 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "anstream" +version = "0.6.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" + +[[package]] +name = "anstyle-parse" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bitflags" @@ -8,12 +76,119 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "clap" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.79", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "colorchoice" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" + +[[package]] +name = "cssparser" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.2", + "smallvec", +] + +[[package]] +name = "cssparser-macros" 
+version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.79", +] + +[[package]] +name = "derive_more" +version = "0.99.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.79", +] + +[[package]] +name = "dtoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" + [[package]] name = "errno" version = "0.3.9" @@ -39,6 +214,65 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 2.0.79", +] + [[package]] name = "idna" version = "0.5.0" @@ -49,6 +283,18 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + [[package]] name = "libc" version = "0.2.159" @@ -61,18 +307,56 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf 0.11.2", + "phf_codegen 0.11.2", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "monal-html-parser" +version = "0.1.0" +dependencies = [ + "clap", + "scraper", +] + [[package]] name = "monal-panic-handler" version = "0.1.0" @@ -81,24 +365,159 @@ version = "0.1.0" name = "monal-rust-swift-bridge" version = "0.1.0" dependencies = [ + "monal-html-parser", "monal-panic-handler", "sdp-to-jingle", "swift-bridge", "swift-bridge-build", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "once_cell" version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + [[package]] name = "percent-encoding" version = "2.3.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_macros", + "phf_shared 0.11.2", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator 0.11.2", + "phf_shared 0.11.2", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared 0.11.2", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" +dependencies = [ + "phf_generator 0.11.2", + "phf_shared 0.11.2", + "proc-macro2", + "quote", + "syn 2.0.79", +] 
+ +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro2" version = "1.0.87" @@ -127,6 +546,45 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +dependencies = [ + 
"bitflags", +] + [[package]] name = "rustix" version = "0.38.37" @@ -140,6 +598,28 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scraper" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0" +dependencies = [ + "ahash", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "once_cell", + "selectors", + "tendril", +] + [[package]] name = "sdp-to-jingle" version = "0.1.0" @@ -150,6 +630,25 @@ dependencies = [ "webrtc-sdp", ] +[[package]] +name = "selectors" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen 0.10.0", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "serde" version = "1.0.210" @@ -170,6 +669,65 @@ dependencies = [ "syn 2.0.79", ] +[[package]] +name = "servo_arc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared 0.10.0", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro2", + "quote", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "swift-bridge" version = "0.1.57" @@ -250,6 +808,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "tinyvec" version = "1.8.0" @@ -286,6 +855,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + [[package]] name = "url" version = "2.5.2" @@ -297,6 +872,30 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8parse" +version = "0.2.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "webrtc-sdp" version = "0.3.13" @@ -390,3 +989,24 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.79", +] diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 9096dd029f..da7bf3c383 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -4,6 +4,7 @@ members = [ "sdp-to-jingle", "monal-panic-handler", "monal-rust-swift-bridge", + "monal-html-parser", ] resolver = "2" \ No newline at end of file diff --git a/rust/build-rust.sh b/rust/build-rust.sh index e1d2f0b630..9d7e465789 100644 --- a/rust/build-rust.sh +++ b/rust/build-rust.sh @@ -12,6 +12,8 @@ then source ~/.cargo/env fi +echo "Installing required components" + rustup +nightly component add rust-src cargo install swift-bridge-cli #rustup component add rust-src --toolchain 
x86_64-apple-ios-macabi @@ -19,7 +21,7 @@ cargo install swift-bridge-cli rustup target add aarch64-apple-ios x86_64-apple-ios aarch64-apple-ios-sim -# Build the project for the desired platforms: +echo "Building stdlib for the desired platforms..." #cargo build --target x86_64-apple-darwin #cargo build --target aarch64-apple-darwin cargo +nightly build --verbose -Z build-std --target x86_64-apple-ios-macabi @@ -27,6 +29,7 @@ cargo +nightly build --verbose -Z build-std --target aarch64-apple-ios-macabi BRIDGE_NAME=libmonal_rust_swift_bridge.a +echo "Creating catalyst target universal lib..." mkdir -p ./target/catalyst-macos/debug lipo \ ./target/x86_64-apple-ios-macabi/debug/$BRIDGE_NAME \ @@ -34,16 +37,20 @@ lipo \ -create -output \ ./target/catalyst-macos/debug/$BRIDGE_NAME +echo "Building rust code for all targets..." cargo build --target aarch64-apple-ios cargo build --target x86_64-apple-ios cargo build --target aarch64-apple-ios-sim +echo "Creating ios target universal lib..." mkdir -p ./target/universal-ios/debug lipo \ ./target/aarch64-apple-ios-sim/debug/$BRIDGE_NAME \ - ./target/x86_64-apple-ios/debug/$BRIDGE_NAME -create -output \ + ./target/x86_64-apple-ios/debug/$BRIDGE_NAME \ + -create -output \ ./target/universal-ios/debug/$BRIDGE_NAME +echo "Creating swift package..." 
swift-bridge-cli create-package \ --bridges-dir ./monal-rust-swift-bridge/generated \ --out-dir LibMonalRustSwiftBridge \ diff --git a/rust/monal-html-parser/Cargo.toml b/rust/monal-html-parser/Cargo.toml new file mode 100644 index 0000000000..e17af047a2 --- /dev/null +++ b/rust/monal-html-parser/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "monal-html-parser" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["staticlib", "lib"] + +[dependencies] +clap = {version = "4.5.20", features = ["derive"]} +scraper = "0.20.0" diff --git a/rust/monal-html-parser/src/bin/parse_html.rs b/rust/monal-html-parser/src/bin/parse_html.rs new file mode 100644 index 0000000000..7acc52f10a --- /dev/null +++ b/rust/monal-html-parser/src/bin/parse_html.rs @@ -0,0 +1,34 @@ +use clap::Parser; +use std::fs; +use std::io::Read; + +use monal_html_parser::MonalHtmlParser; + +/// Parse the given html file for text contents or attributes of given selector +#[derive(Parser)] +struct Cli { + /// The path to the file to read (use '-' for stdin) + path: std::path::PathBuf, + /// The selector to look for + selector: String, + /// An optional attribute name to return (omit to return text contents) + attribute: Option<String>, +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let args = Cli::parse(); + println!( + "path: {:?}, selector: {:?}, attribute: {:?}", + args.path, args.selector, args.attribute + ); + let mut html = String::new(); + if args.path.as_os_str().to_str() == Some("-") { + std::io::stdin().lock().read_to_string(&mut html)?; + } else { + html = fs::read_to_string(args.path)?; + } + let parser = MonalHtmlParser::new(html); + let found = parser.select(args.selector, args.attribute); + println!("result: {:?}", found); + Ok(()) +} diff --git a/rust/monal-html-parser/src/lib.rs b/rust/monal-html-parser/src/lib.rs new file mode 100644 index 0000000000..c6f91ae7eb --- /dev/null +++ 
b/rust/monal-html-parser/src/lib.rs @@ -0,0 +1,38 @@ +use scraper::{Html, Selector}; + +pub struct MonalHtmlParser { + document: Html, +} + +impl MonalHtmlParser { + pub fn new(html: String) -> Self { + let document = Html::parse_document(&html); + MonalHtmlParser { document } + } + + pub fn select( + &self, + selector: String, + atrribute: Option<String>, + ) -> Vec<String> { + let mut retval = Vec::new(); + let sel = match Selector::parse(&selector) { + Ok(value) => value, + Err(error) => { + eprintln!("Selector '{selector}' parse error: {error}"); + return retval; + } + }; + for element in self.document.select(&sel) { + match atrribute { + Some(ref attr) => { + if let Some(val) = element.attr(attr) { + retval.push(val.to_string()) + } + } + None => retval.push(element.text().map(String::from).collect()), + }; + } + retval + } +} diff --git a/rust/monal-rust-swift-bridge/Cargo.toml b/rust/monal-rust-swift-bridge/Cargo.toml index 5ccb868c98..df789dfd9b 100644 --- a/rust/monal-rust-swift-bridge/Cargo.toml +++ b/rust/monal-rust-swift-bridge/Cargo.toml @@ -12,6 +12,7 @@ crate-type = ["staticlib"] swift-bridge = "0.1" sdp-to-jingle = { path = "../sdp-to-jingle" } monal-panic-handler = { path = "../monal-panic-handler" } +monal-html-parser = { path = "../monal-html-parser" } [build-dependencies] diff --git a/rust/monal-rust-swift-bridge/src/lib.rs b/rust/monal-rust-swift-bridge/src/lib.rs index cf4f479f82..37faf36bb3 100644 --- a/rust/monal-rust-swift-bridge/src/lib.rs +++ b/rust/monal-rust-swift-bridge/src/lib.rs @@ -1,7 +1,9 @@ use crate::ffi::rust_panic_handler; +use monal_html_parser::MonalHtmlParser; #[swift_bridge::bridge] mod ffi { + //simple functions exported from rust to swift extern "Rust" { pub fn install_panichandler(); pub fn trigger_panic(); @@ -9,6 +11,19 @@ mod ffi { pub fn jingle_str_to_sdp_str(jingle_str: String, initiator: bool) -> Option<String>; } + //rust struct exported from rust to swift + extern "Rust" { + type MonalHtmlParser; + #[swift_bridge(init)] + pub fn 
new(html: String) -> MonalHtmlParser; + pub fn select( + &self, + selector: String, + atrribute: Option<String>, + ) -> Vec<String>; + } + + //exported from our internal swift helper to rust extern "Swift" { fn rust_panic_handler(text: String, backtrace: String); }