Skip to content

Commit

Permalink
Update HTML parser (#509)
Browse files Browse the repository at this point in the history
* Update HTML parser

* Fix the release script so as to delete files that we no longer release
  • Loading branch information
mjambon authored Sep 16, 2024
1 parent 86322f1 commit 73bf698
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 30 deletions.
3 changes: 3 additions & 0 deletions lang/release
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ make -C "$lang" clean
make -C "$lang" GEN_OCAML_OPTIONS="$GEN_OCAML_OPTIONS" gen
dst="$export_dir"/"$repo"
if [[ -d "$lang"/ocaml-src ]]; then
# Make sure obsolete files are removed from the release repo
git -C "$dst" rm -rf lib bin

cp -a "$lang"/ocaml-src/lib "$dst"
cp -a "$lang"/ocaml-src/bin "$dst"
cp -a "$lang"/ocaml-src/.gitignore "$dst"
Expand Down
4 changes: 2 additions & 2 deletions lang/semgrep-grammars/src/semgrep-html/grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Extends the standard HTML grammar with Semgrep pattern constructs.
It now also extend the HTML grammar with XML constructs! See scanner_cc.diff
It now also extends the HTML grammar with XML constructs! See scanner_c.diff
for the extension to the scanner to support XML entity names (e.g., <f:bar></f:bar>).
An alternative would be to switch to https://github.com/unhammer/tree-sitter-xml,
but its grammar looks very complicated and the code is not maintained.
Expand Down Expand Up @@ -33,7 +33,7 @@ module.exports = grammar(base_grammar, {
// Hence the introduction of an extra _toplevel_node that does not
// allow toplevel text. Hopefully most HTML files have some
// toplevel elements (e.g., <html>) and not just text.
fragment: $ => choice(
document: $ => choice(
repeat($._toplevel_node),
$.toplevel_attribute,
),
Expand Down
8 changes: 4 additions & 4 deletions lang/semgrep-grammars/src/semgrep-html/prep
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ mkdir -p src

(
cd src
rm -f scanner.cc
cp ../../tree-sitter-html/src/scanner.cc scanner.cc
rm -f scanner.cc scanner.c
cp ../../tree-sitter-html/src/scanner.c scanner.c
)
# to extend the HTML lexer to also accept XML tags
patch -p1 < scanner_cc.diff
patch -p1 < scanner_c.diff

(
cd src
Expand All @@ -25,5 +25,5 @@ mkdir -p test/corpus
(
cd test/corpus
rm -f inherited
ln -sf ../../../tree-sitter-html/corpus inherited
ln -sf ../../../tree-sitter-html/test/corpus inherited
)
19 changes: 19 additions & 0 deletions lang/semgrep-grammars/src/semgrep-html/scanner_c.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
diff --git a/src/scanner.c b/src/scanner.c
index eecef9a..eddc236 100644
--- a/src/scanner.c
+++ b/src/scanner.c
@@ -102,7 +102,13 @@ static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {

static String scan_tag_name(TSLexer *lexer) {
String tag_name = array_new();
- while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
+ while (iswalnum(lexer->lookahead) ||
+ // to accept xml names!
+ lexer->lookahead == '.' ||
+ lexer->lookahead == '_' ||
+ // original html-only name
+ lexer->lookahead == '-' ||
+ lexer->lookahead == ':') {
array_push(&tag_name, towupper(lexer->lookahead));
advance(lexer);
}
16 changes: 0 additions & 16 deletions lang/semgrep-grammars/src/semgrep-html/scanner_cc.diff

This file was deleted.

14 changes: 7 additions & 7 deletions lang/semgrep-grammars/src/semgrep-html/test/corpus/semgrep.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Metavariable tag
<$X>foo</$X>
---

(fragment
(document
(element
(start_tag (semgrep_start_tag (semgrep_metavariable)))
(text)
Expand All @@ -17,7 +17,7 @@ Metavariable attribute

---

(fragment
(document
(element
(start_tag (tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))
Expand All @@ -31,7 +31,7 @@ Metavariable attribute value

---

(fragment
(document
(element
(start_tag (tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))
Expand All @@ -44,7 +44,7 @@ Metavariable body
<span>$BODY</span>

---
(fragment
(document
(element
(start_tag (tag_name))
(text)
Expand All @@ -57,7 +57,7 @@ Ellipsis in attributes and body
<script ... >...</script>

---
(fragment
(document
(script_element
(start_tag (tag_name) (attribute (attribute_name)))
(raw_text)
Expand All @@ -71,7 +71,7 @@ XML constructs
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"></xs:schema>

---
(fragment
(document
(xmldoctype
(attribute
(attribute_name)
Expand All @@ -96,7 +96,7 @@ Attribute pattern
===================================
foo="true"
---
(fragment
(document
(toplevel_attribute
(attribute_name)
(quoted_attribute_value
Expand Down
2 changes: 1 addition & 1 deletion lang/semgrep-grammars/src/tree-sitter-html
Submodule tree-sitter-html updated 56 files
+0 −22 .appveyor.yml
+39 −0 .editorconfig
+11 −1 .gitattributes
+8 −0 .github/dependabot.yml
+52 −0 .github/workflows/ci.yml
+21 −0 .github/workflows/fuzz.yml
+26 −0 .github/workflows/lint.yml
+23 −0 .github/workflows/release.yml
+37 −6 .gitignore
+0 −6 .npmignore
+0 −10 .travis.yml
+89 −0 Cargo.lock
+16 −12 Cargo.toml
+114 −0 Makefile
+60 −0 Package.swift
+16 −8 README.md
+18 −7 binding.gyp
+16 −0 bindings/c/tree-sitter-html.h
+11 −0 bindings/c/tree-sitter-html.pc.in
+13 −0 bindings/go/binding.go
+15 −0 bindings/go/binding_test.go
+14 −22 bindings/node/binding.cc
+9 −0 bindings/node/binding_test.js
+28 −0 bindings/node/index.d.ts
+3 −15 bindings/node/index.js
+11 −0 bindings/python/tests/test_binding.py
+34 −0 bindings/python/tree_sitter_html/__init__.py
+5 −0 bindings/python/tree_sitter_html/__init__.pyi
+27 −0 bindings/python/tree_sitter_html/binding.c
+0 −0 bindings/python/tree_sitter_html/py.typed
+12 −14 bindings/rust/build.rs
+20 −19 bindings/rust/lib.rs
+16 −0 bindings/swift/TreeSitterHTML/html.h
+12 −0 bindings/swift/TreeSitterHTMLTests/TreeSitterHTMLTests.swift
+9 −0 go.mod
+34 −0 go.sum
+39 −23 grammar.js
+1,478 −0 package-lock.json
+92 −18 package.json
+33 −0 pyproject.toml
+1 −0 queries/highlights.scm
+62 −0 setup.py
+10 −3 src/grammar.json
+47 −35 src/node-types.json
+1,320 −948 src/parser.c
+362 −0 src/scanner.c
+0 −310 src/scanner.cc
+358 −353 src/tag.h
+54 −0 src/tree_sitter/alloc.h
+290 −0 src/tree_sitter/array.h
+56 −13 src/tree_sitter/parser.h
+86 −16 test/corpus/main.txt
+39 −0 test/highlight/attributes.html
+6 −0 test/highlight/doctype.html
+7 −0 test/highlight/erroneous.html
+15 −0 test/highlight/self-closing.html

0 comments on commit 73bf698

Please sign in to comment.