Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse input with parse5 #5

Merged
merged 2 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"lodash.isequal": "^4.5.0",
"lucide-react": "^0.447.0",
"openai": "^4.65.0",
"parse5": "^7.1.2",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-markdown": "^9.0.1",
Expand Down
58 changes: 6 additions & 52 deletions src/components/ReactMarkdown.tsx
Original file line number Diff line number Diff line change
@@ -1,56 +1,11 @@
import Markdown from 'react-markdown'

import MarkdownCodeComponent from './MarkdownCodeComponent'

function parsesmtcmpBlocks(input: string): (
| { type: 'string'; content: string }
| {
type: 'smtcmpBlock'
content: string
language?: string
filename?: string
}
)[] {
const regex = /<smtcmpBlock([^>]*)>\s*([\s\S]*?)\s*(?:<\/smtcmpBlock>|$)/g
const matches = input.matchAll(regex)
const result: (
| { type: 'string'; content: string }
| {
type: 'smtcmpBlock'
content: string
language?: string
filename?: string
}
)[] = []

let lastIndex = 0
for (const match of matches) {
if (match.index > lastIndex) {
result.push({
type: 'string',
content: input.slice(lastIndex, match.index),
})
}
const [, attributes, content] = match
const language = attributes.match(/language="([^"]+)"/)?.[1]
const filename = attributes.match(/filename="([^"]+)"/)?.[1]
result.push({
type: 'smtcmpBlock',
content,
language,
filename,
})
lastIndex = match.index + match[0].length
}
if (lastIndex < input.length) {
result.push({
type: 'string',
content: input.slice(lastIndex),
})
}
import {
ParsedSmtcmpBlock,
parsesmtcmpBlocks,
} from '../utils/parse-smtcmp-block'

return result
}
import MarkdownCodeComponent from './MarkdownCodeComponent'

export default function ReactMarkdown({
onApply,
Expand All @@ -61,8 +16,7 @@ export default function ReactMarkdown({
children: string
isApplying: boolean
}) {
const blocks = parsesmtcmpBlocks(children)

const blocks: ParsedSmtcmpBlock[] = parsesmtcmpBlocks(children)
return (
<>
{blocks.map((block, index) =>
Expand Down
10 changes: 5 additions & 5 deletions src/utils/apply.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ const systemPrompt = `You are an intelligent assistant helping a user apply chan

You will receive:
1. The content of the target markdown file.
2. A conversation history between the user and the assistant. This conversation may contain multiple markdown blocks suggesting changes to the file. Markdown blocks are indicated by the <smtcmpBlock> tag. For example:
<smtcmpBlock>
2. A conversation history between the user and the assistant. This conversation may contain multiple markdown blocks suggesting changes to the file. Markdown blocks are indicated by the <smtcmp_block> tag. For example:
<smtcmp_block>
<!-- ... existing content ... -->
{{ edit_1 }}
<!-- ... existing content ... -->
{{ edit_2 }}
<!-- ... existing content ... -->
</smtcmpBlock>
</smtcmp_block>
3. A single, specific markdown block extracted from the conversation history. This block contains the exact changes that should be applied to the target file.

Please rewrite the entire markdown file with ONLY the changes from the specified markdown block applied. DO NOT apply changes suggested by other parts of the conversation. Preserve all parts of the original file that are not related to the changes. Output only the file content, without any additional words or explanations.`
Expand Down Expand Up @@ -72,9 +72,9 @@ ${chatMessages

## Changes to Apply
Here is the markdown block that indicates where content changes should be applied.
<smtcmpBlock>
<smtcmp_block>
${blockToApply}
</smtcmpBlock>
</smtcmp_block>

Now rewrite the entire file with the changes applied. Immediately start your response with \`\`\`${currentFile.path}`
}
Expand Down
158 changes: 158 additions & 0 deletions src/utils/parse-smtcmp-block.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import { ParsedSmtcmpBlock, parsesmtcmpBlocks } from './parse-smtcmp-block'

// Tests for parsesmtcmpBlocks: each case feeds a raw string to the parser and
// compares the returned segment list against a hand-written expectation.
// Template-literal fixtures are whitespace-sensitive: every character between
// the backticks is part of the input or of the expected content.
describe('parsesmtcmpBlocks', () => {
  // One block with both attributes, surrounded by plain text on either side.
  it('should parse a string with smtcmp_block elements', () => {
    const input = `Some text before
<smtcmp_block language="markdown" filename="example.md">
# Example Markdown

This is a sample markdown content for testing purposes.

## Features

- Lists
- **Bold text**
- *Italic text*
- [Links](https://example.com)

### Code Block
\`\`\`python
print("Hello, world!")
\`\`\`
</smtcmp_block>
Some text after`

    const expected: ParsedSmtcmpBlock[] = [
      { type: 'string', content: 'Some text before\n' },
      {
        type: 'smtcmp_block',
        content: `
# Example Markdown

This is a sample markdown content for testing purposes.

## Features

- Lists
- **Bold text**
- *Italic text*
- [Links](https://example.com)

### Code Block
\`\`\`python
print("Hello, world!")
\`\`\`
`,
        language: 'markdown',
        filename: 'example.md',
      },
      { type: 'string', content: '\nSome text after' },
    ]

    const result = parsesmtcmpBlocks(input)
    expect(result).toEqual(expected)
  })

  // A block with no children must yield empty content and keep the
  // surrounding whitespace as separate text segments.
  it('should handle empty smtcmp_block elements', () => {
    // The whitespace around the tag is written with explicit escapes so the
    // fixture and the expected text segments below cannot drift apart when
    // the file is re-indented or reformatted.
    const input = '\n <smtcmp_block language="python"></smtcmp_block>\n '

    const expected: ParsedSmtcmpBlock[] = [
      { type: 'string', content: '\n ' },
      {
        type: 'smtcmp_block',
        content: '',
        language: 'python',
        filename: undefined,
      },
      { type: 'string', content: '\n ' },
    ]

    const result = parsesmtcmpBlocks(input)
    expect(result).toEqual(expected)
  })

  // Input without any block comes back as a single text segment.
  it('should handle input without smtcmp_block elements', () => {
    const input = 'Just a regular string without any smtcmp_block elements.'

    const expected: ParsedSmtcmpBlock[] = [{ type: 'string', content: input }]

    const result = parsesmtcmpBlocks(input)
    expect(result).toEqual(expected)
  })

  // Two blocks separated by text; the second one contains a tilde code fence.
  it('should handle multiple smtcmp_block elements', () => {
    const input = `Start
<smtcmp_block language="python" filename="script.py">
def greet(name):
print(f"Hello, {name}!")
</smtcmp_block>
Middle
<smtcmp_block language="markdown" filename="example.md">
# Using tildes for code blocks

Did you know that you can use tildes for code blocks?

~~~python
print("Hello, world!")
~~~
</smtcmp_block>
End`

    const expected: ParsedSmtcmpBlock[] = [
      { type: 'string', content: 'Start\n' },
      {
        type: 'smtcmp_block',
        content: `
def greet(name):
print(f"Hello, {name}!")
`,
        language: 'python',
        filename: 'script.py',
      },
      { type: 'string', content: '\nMiddle\n' },
      {
        type: 'smtcmp_block',
        content: `
# Using tildes for code blocks

Did you know that you can use tildes for code blocks?

~~~python
print("Hello, world!")
~~~
`,
        language: 'markdown',
        filename: 'example.md',
      },
      { type: 'string', content: '\nEnd' },
    ]

    const result = parsesmtcmpBlocks(input)
    expect(result).toEqual(expected)
  })

  // An opening tag with no closing tag: the block is expected to extend to
  // the end of the input, with no trailing text segment.
  it('should handle unfinished smtcmp_block with only opening tag', () => {
    const input = `Start
<smtcmp_block language="markdown">
# Unfinished smtcmp_block

Some text after without closing tag`
    const expected: ParsedSmtcmpBlock[] = [
      { type: 'string', content: 'Start\n' },
      {
        type: 'smtcmp_block',
        content: `
# Unfinished smtcmp_block

Some text after without closing tag`,
        language: 'markdown',
        filename: undefined,
      },
    ]

    const result = parsesmtcmpBlocks(input)
    expect(result).toEqual(expected)
  })
})
72 changes: 72 additions & 0 deletions src/utils/parse-smtcmp-block.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { parseFragment } from 'parse5'

/** A run of plain text, tagged with the `'string'` discriminant. */
type SmtcmpTextSegment = {
  type: 'string'
  content: string
}

/**
 * A `<smtcmp_block>` element, tagged with the `'smtcmp_block'` discriminant.
 * `language` and `filename` are optional and may be absent.
 */
type SmtcmpBlockSegment = {
  type: 'smtcmp_block'
  content: string
  language?: string
  filename?: string
}

/**
 * One segment of parsed input: either plain text or a `<smtcmp_block>`.
 * Discriminate on the `type` field.
 */
export type ParsedSmtcmpBlock = SmtcmpTextSegment | SmtcmpBlockSegment

/**
 * Splits `input` into an ordered list of plain-text segments and
 * `<smtcmp_block>` segments.
 *
 * The input is parsed as an HTML fragment with parse5, with source locations
 * enabled. Every segment's `content` is sliced verbatim out of `input` using
 * those offsets, so the original text is preserved rather than re-serialized.
 *
 * Only direct children of the fragment root are inspected. A `<smtcmp_block>`
 * that the HTML parser nests inside another element is not extracted and stays
 * part of the surrounding text segment.
 *
 * @param input - Raw text that may contain `<smtcmp_block>` elements.
 * @returns Segments in source order. A block's `language` and `filename` come
 *   from the attributes of the same name and are `undefined` when absent.
 * @throws Error if parse5 provides no source location for a block or for its
 *   first or last child node.
 */
export function parsesmtcmpBlocks(input: string): ParsedSmtcmpBlock[] {
  const parsedResult: ParsedSmtcmpBlock[] = []
  const fragment = parseFragment(input, {
    sourceCodeLocationInfo: true,
  })

  // Offset in `input` just past the last block emitted so far.
  let lastEndOffset = 0

  for (const node of fragment.childNodes) {
    if (node.nodeName !== 'smtcmp_block') {
      continue
    }

    const location = node.sourceCodeLocation
    if (location == null) {
      throw new Error('sourceCodeLocation is undefined')
    }
    const { startOffset, endOffset } = location

    // Everything between the previous block and this one is plain text.
    if (startOffset > lastEndOffset) {
      parsedResult.push({
        type: 'string',
        content: input.slice(lastEndOffset, startOffset),
      })
    }

    const language = node.attrs.find((attr) => attr.name === 'language')?.value
    const filename = node.attrs.find((attr) => attr.name === 'filename')?.value

    // An element without children has empty content. Otherwise the content is
    // the source text from the start of the first child to the end of the last.
    let content = ''
    const children = node.childNodes
    if (children.length > 0) {
      const innerContentStartOffset =
        children[0]?.sourceCodeLocation?.startOffset
      const innerContentEndOffset =
        children[children.length - 1]?.sourceCodeLocation?.endOffset
      // Compare against undefined explicitly: offsets are numbers, and a
      // truthiness test would treat a legitimate offset of 0 as missing.
      if (
        innerContentStartOffset === undefined ||
        innerContentEndOffset === undefined
      ) {
        throw new Error('sourceCodeLocation is undefined')
      }
      content = input.slice(innerContentStartOffset, innerContentEndOffset)
    }

    parsedResult.push({
      type: 'smtcmp_block',
      content,
      language,
      filename,
    })
    lastEndOffset = endOffset
  }

  // Trailing text after the last block, or the whole input if there was none.
  if (lastEndOffset < input.length) {
    parsedResult.push({
      type: 'string',
      content: input.slice(lastEndOffset),
    })
  }
  return parsedResult
}
Loading
Loading