glowingjade · glowingjade · Oct 14, 2024 · Oct 12, 2024 · Oct 13, 2024
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -50,6 +50,7 @@
     "lodash.isequal": "^4.5.0",
     "lucide-react": "^0.447.0",
     "openai": "^4.65.0",
+    "parse5": "^7.1.2",
     "react": "^18.3.1",
     "react-dom": "^18.3.1",
     "react-markdown": "^9.0.1",

diff --git a/src/components/ReactMarkdown.tsx b/src/components/ReactMarkdown.tsx
@@ -1,56 +1,11 @@
 import Markdown from 'react-markdown'
 
-import MarkdownCodeComponent from './MarkdownCodeComponent'
-
-function parsesmtcmpBlocks(input: string): (
-  | { type: 'string'; content: string }
-  | {
-      type: 'smtcmpBlock'
-      content: string
-      language?: string
-      filename?: string
-    }
-)[] {
-  const regex = /<smtcmpBlock([^>]*)>\s*([\s\S]*?)\s*(?:<\/smtcmpBlock>|$)/g
-  const matches = input.matchAll(regex)
-  const result: (
-    | { type: 'string'; content: string }
-    | {
-        type: 'smtcmpBlock'
-        content: string
-        language?: string
-        filename?: string
-      }
-  )[] = []
-
-  let lastIndex = 0
-  for (const match of matches) {
-    if (match.index > lastIndex) {
-      result.push({
-        type: 'string',
-        content: input.slice(lastIndex, match.index),
-      })
-    }
-    const [, attributes, content] = match
-    const language = attributes.match(/language="([^"]+)"/)?.[1]
-    const filename = attributes.match(/filename="([^"]+)"/)?.[1]
-    result.push({
-      type: 'smtcmpBlock',
-      content,
-      language,
-      filename,
-    })
-    lastIndex = match.index + match[0].length
-  }
-  if (lastIndex < input.length) {
-    result.push({
-      type: 'string',
-      content: input.slice(lastIndex),
-    })
-  }
+import {
+  ParsedSmtcmpBlock,
+  parsesmtcmpBlocks,
+} from '../utils/parse-smtcmp-block'
 
-  return result
-}
+import MarkdownCodeComponent from './MarkdownCodeComponent'
 
 export default function ReactMarkdown({
   onApply,
@@ -61,8 +16,7 @@ export default function ReactMarkdown({
   children: string
   isApplying: boolean
 }) {
-  const blocks = parsesmtcmpBlocks(children)
-
+  const blocks: ParsedSmtcmpBlock[] = parsesmtcmpBlocks(children)
   return (
     <>
       {blocks.map((block, index) =>

diff --git a/src/utils/apply.ts b/src/utils/apply.ts
@@ -10,14 +10,14 @@ const systemPrompt = `You are an intelligent assistant helping a user apply chan
 
 You will receive:
 1. The content of the target markdown file.
-2. A conversation history between the user and the assistant. This conversation may contain multiple markdown blocks suggesting changes to the file. Markdown blocks are indicated by the <smtcmpBlock> tag. For example:
-<smtcmpBlock>
+2. A conversation history between the user and the assistant. This conversation may contain multiple markdown blocks suggesting changes to the file. Markdown blocks are indicated by the <smtcmp_block> tag. For example:
+<smtcmp_block>
 <!-- ... existing content ... -->
 {{ edit_1 }}
 <!-- ... existing content ... -->
 {{ edit_2 }}
 <!-- ... existing content ... -->
-</smtcmpBlock>
+</smtcmp_block>
 3. A single, specific markdown block extracted from the conversation history. This block contains the exact changes that should be applied to the target file.
 
 Please rewrite the entire markdown file with ONLY the changes from the specified markdown block applied. DO NOT apply changes suggested by other parts of the conversation. Preserve all parts of the original file that are not related to the changes. Output only the file content, without any additional words or explanations.`
@@ -72,9 +72,9 @@ ${chatMessages
 
 ## Changes to Apply
 Here is the markdown block that indicates where content changes should be applied.
-<smtcmpBlock>
+<smtcmp_block>
 ${blockToApply}
-</smtcmpBlock>
+</smtcmp_block>
 
 Now rewrite the entire file with the changes applied. Immediately start your response with \`\`\`${currentFile.path}`
 }

diff --git a/src/utils/parse-smtcmp-block.test.ts b/src/utils/parse-smtcmp-block.test.ts
@@ -0,0 +1,158 @@
+import { ParsedSmtcmpBlock, parsesmtcmpBlocks } from './parse-smtcmp-block'
+
+describe('parsesmtcmpBlocks', () => {
+  it('should parse a string with smtcmp_block elements', () => { // newlines just inside the tags are expected to stay in `content`
+    const input = `Some text before
+<smtcmp_block language="markdown" filename="example.md">
+# Example Markdown
+
+This is a sample markdown content for testing purposes.
+
+## Features
+
+- Lists
+- **Bold text**
+- *Italic text*
+- [Links](https://example.com)
+
+### Code Block
+\`\`\`python
+print("Hello, world!")
+\`\`\`
+</smtcmp_block>
+Some text after`
+
+    const expected: ParsedSmtcmpBlock[] = [
+      { type: 'string', content: 'Some text before\n' },
+      {
+        type: 'smtcmp_block',
+        content: `
+# Example Markdown
+
+This is a sample markdown content for testing purposes.
+
+## Features
+
+- Lists
+- **Bold text**
+- *Italic text*
+- [Links](https://example.com)
+
+### Code Block
+\`\`\`python
+print("Hello, world!")
+\`\`\`
+`,
+        language: 'markdown',
+        filename: 'example.md',
+      },
+      { type: 'string', content: '\nSome text after' },
+    ]
+
+    const result = parsesmtcmpBlocks(input)
+    expect(result).toEqual(expected)
+  })
+
+  it('should handle empty smtcmp_block elements', () => { // surrounding whitespace is expected in the neighbouring string segments
+    const input = `
+      <smtcmp_block language="python"></smtcmp_block>
+    `
+
+    const expected: ParsedSmtcmpBlock[] = [
+      { type: 'string', content: '\n      ' },
+      {
+        type: 'smtcmp_block',
+        content: '',
+        language: 'python',
+        filename: undefined,
+      },
+      { type: 'string', content: '\n    ' },
+    ]
+
+    const result = parsesmtcmpBlocks(input)
+    expect(result).toEqual(expected)
+  })
+
+  it('should handle input without smtcmp_block elements', () => { // the whole input is expected back as a single string segment
+    const input = 'Just a regular string without any smtcmp_block elements.'
+
+    const expected: ParsedSmtcmpBlock[] = [{ type: 'string', content: input }]
+
+    const result = parsesmtcmpBlocks(input)
+    expect(result).toEqual(expected)
+  })
+
+  it('should handle multiple smtcmp_block elements', () => { // also checks that ~~~ fences inside a block come back unchanged
+    const input = `Start
+<smtcmp_block language="python" filename="script.py">
+def greet(name):
+    print(f"Hello, {name}!")
+</smtcmp_block>
+Middle
+<smtcmp_block language="markdown" filename="example.md">
+# Using tildes for code blocks
+
+Did you know that you can use tildes for code blocks?
+
+~~~python
+print("Hello, world!")
+~~~
+</smtcmp_block>
+End`
+
+    const expected: ParsedSmtcmpBlock[] = [
+      { type: 'string', content: 'Start\n' },
+      {
+        type: 'smtcmp_block',
+        content: `
+def greet(name):
+    print(f"Hello, {name}!")
+`,
+        language: 'python',
+        filename: 'script.py',
+      },
+      { type: 'string', content: '\nMiddle\n' },
+      {
+        type: 'smtcmp_block',
+        content: `
+# Using tildes for code blocks
+
+Did you know that you can use tildes for code blocks?
+
+~~~python
+print("Hello, world!")
+~~~
+`,
+        language: 'markdown',
+        filename: 'example.md',
+      },
+      { type: 'string', content: '\nEnd' },
+    ]
+
+    const result = parsesmtcmpBlocks(input)
+    expect(result).toEqual(expected)
+  })
+
+  it('should handle unfinished smtcmp_block with only opening tag', () => { // content is expected to run to the end of input, with no trailing string segment
+    const input = `Start
+<smtcmp_block language="markdown">
+# Unfinished smtcmp_block
+
+Some text after without closing tag`
+    const expected: ParsedSmtcmpBlock[] = [
+      { type: 'string', content: 'Start\n' },
+      {
+        type: 'smtcmp_block',
+        content: `
+# Unfinished smtcmp_block
+
+Some text after without closing tag`,
+        language: 'markdown',
+        filename: undefined,
+      },
+    ]
+
+    const result = parsesmtcmpBlocks(input)
+    expect(result).toEqual(expected)
+  })
+})
diff --git a/src/utils/parse-smtcmp-block.ts b/src/utils/parse-smtcmp-block.ts
@@ -0,0 +1,72 @@
+import { parseFragment } from 'parse5'
+
+export type ParsedSmtcmpBlock = // one segment of the parsed input, discriminated by `type`
+  | { type: 'string'; content: string } // input text that lies outside any extracted block
+  | {
+      type: 'smtcmp_block' // an extracted <smtcmp_block> element
+      content: string // text sliced from the input between the block's tags
+      language?: string // value of the `language` attribute; undefined when absent
+      filename?: string // value of the `filename` attribute; undefined when absent
+    }
+
+/**
+ * Splits `input` into plain-text segments and top-level `<smtcmp_block>` elements.
+ *
+ * parse5 is used only to locate the elements; each `content` is sliced from
+ * `input` by source offset, so the original text is returned verbatim.
+ *
+ * @param input - Text that may contain `<smtcmp_block>` elements.
+ * @returns The segments in source order.
+ * @throws Error when parse5 reports no source location for a block.
+ */
+export function parsesmtcmpBlocks(input: string): ParsedSmtcmpBlock[] {
+  const parsedResult: ParsedSmtcmpBlock[] = []
+  const fragment = parseFragment(input, {
+    sourceCodeLocationInfo: true,
+  })
+  let lastEndOffset = 0
+  for (const node of fragment.childNodes) {
+    // Only top-level nodes are examined; anything else stays in the text segments.
+    if (node.nodeName === 'smtcmp_block') {
+      const location = node.sourceCodeLocation
+      const startTag = location?.startTag
+      if (!location || !startTag) {
+        throw new Error('sourceCodeLocation is undefined')
+      }
+      const { startOffset, endOffset, endTag } = location
+      if (startOffset > lastEndOffset) {
+        parsedResult.push({
+          type: 'string',
+          content: input.slice(lastEndOffset, startOffset),
+        })
+      }
+
+      const language = node.attrs.find(
+        (attr) => attr.name === 'language',
+      )?.value
+      const filename = node.attrs.find(
+        (attr) => attr.name === 'filename',
+      )?.value
+
+      // Derive the inner range from the tag locations rather than the child nodes:
+      // child locations can skip source text that the parser ignored or re-parented.
+      // Without a closing tag, the element's own end offset bounds the content.
+      const innerContentStartOffset = startTag.endOffset
+      const innerContentEndOffset = endTag?.startOffset ?? endOffset
+      parsedResult.push({
+        type: 'smtcmp_block',
+        content: input.slice(innerContentStartOffset, innerContentEndOffset),
+        language,
+        filename,
+      })
+      lastEndOffset = endOffset
+    }
+  }
+  if (lastEndOffset < input.length) {
+    parsedResult.push({
+      type: 'string',
+      content: input.slice(lastEndOffset),
+    })
+  }
+  return parsedResult
+}