Skip to content

Commit

Permalink
Merge pull request #123 from billhails/utf8-parser
Browse files Browse the repository at this point in the history
Utf8 parser
  • Loading branch information
billhails authored Oct 21, 2024
2 parents 8404755 + 50302e3 commit f7a9bfb
Show file tree
Hide file tree
Showing 24 changed files with 339 additions and 204 deletions.
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ EXTRA_TARGETS= \
$(EXTRA_OBJTYPES_H_TARGETS) \
$(EXTRA_DEBUG_H_TARGETS) \
$(EXTRA_DEBUG_C_TARGETS) \
generated/UnicodeData.inc
generated/UnicodeData.inc \
generated/UnicodeDigits.inc

MAIN=src/main.c
PREAMBLE=generated/preamble.c
Expand Down Expand Up @@ -184,6 +185,9 @@ unicode/UnicodeData.txt: | unicode
generated/UnicodeData.inc: unicode/UnicodeData.txt tools/analyzeCsv.py | generated
$(PYTHON) ./tools/analyzeCsv.py > $@

generated/UnicodeDigits.inc: unicode/UnicodeData.txt tools/makeUnicodeDigits.py | generated
$(PYTHON) ./tools/makeUnicodeDigits.py > $@

realclean: clean
rm -rf tags unicode

Expand Down
7 changes: 2 additions & 5 deletions docs/generated/anf.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ CexpCond --cases--> CexpCondCases
CexpIntCondCases --option--> MaybeBigInt
CexpIntCondCases --body--> Exp
CexpIntCondCases --next--> CexpIntCondCases
CexpCharCondCases --option--> char
CexpCharCondCases --option--> character
CexpCharCondCases --body--> Exp
CexpCharCondCases --next--> CexpCharCondCases
CexpMatch --condition--> Aexp
Expand All @@ -73,15 +73,12 @@ ExpLookup --annotatedVar--> AexpAnnotatedVar
ExpLookup --body--> Exp
CexpCondCases --charCases--> CexpCharCondCases
CexpCondCases --intCases--> CexpIntCondCases
Aexp --t--> void_ptr
Aexp --f--> void_ptr
Aexp --v--> void_ptr
Aexp --lam--> AexpLam
Aexp --var--> HashSymbol
Aexp --annotatedVar--> AexpAnnotatedVar
Aexp --biginteger--> MaybeBigInt
Aexp --littleinteger--> int
Aexp --character--> char
Aexp --character--> character
Aexp --prim--> AexpPrimApp
Aexp --unary--> AexpUnaryApp
Aexp --makeVec--> AexpMakeVec
Expand Down
5 changes: 2 additions & 3 deletions docs/generated/ast.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ AstArg --named--> AstNamedArg
AstArg --unpack--> AstUnpack
AstArg --unpackStruct--> AstUnpackStruct
AstArg --number--> MaybeBigInt
AstArg --character--> char
AstArg --character--> character
AstArg --tuple--> AstArgList
AstExpression --back--> void_ptr
AstExpression --wildcard--> void_ptr
Expand All @@ -116,7 +116,7 @@ AstExpression --lookup--> AstLookup
AstExpression --symbol--> HashSymbol
AstExpression --gensym--> HashSymbol
AstExpression --number--> MaybeBigInt
AstExpression --character--> char
AstExpression --character--> character
AstExpression --fun--> AstCompositeFunction
AstExpression --nest--> AstNest
AstExpression --iff--> AstIff
Expand All @@ -127,7 +127,6 @@ AstExpression --structure--> AstStruct
AstExpression --assertion--> AstExpression
AstExpression --error--> AstExpression
AstPosition["enum AstPosition"]
AstCharArray["AstCharArray[]"] --entries--> char
AstNamespaceArray["AstNamespaceArray[]"] --entries--> AstNamespaceImpl
AstFileIdArray["AstFileIdArray[]"] --entries--> file_id
AstStringArray["AstStringArray[]"] --entries--> string
Expand Down
4 changes: 2 additions & 2 deletions docs/generated/cekfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Value --stdint_imag--> int
Value --bigint_imag--> BigInt
Value --irrational_imag--> double
Value --complex--> Vec
Value --character--> char
Value --character--> character
Value --clo--> Clo
Value --pclo--> Clo
Value --kont--> Kont
Expand All @@ -46,7 +46,7 @@ Value --opaque--> opaque
ByteCodeArray["ByteCodeArray[]"] --entries--> byte
Stack["Stack[]"] --entries--> Value
ByteCodes["enum ByteCodes"]
CharArray["CharArray[]"] --entries--> char
CharacterArray["CharacterArray[]"] --entries--> character
ByteArray["ByteArray[]"] --entries--> byte
Frame["Frame[]"] --entries--> Value
ValueVal
Expand Down
4 changes: 2 additions & 2 deletions docs/generated/lambda.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ LamCond --cases--> LamCondCases
LamIntCondCases --constant--> MaybeBigInt
LamIntCondCases --body--> LamExp
LamIntCondCases --next--> LamIntCondCases
LamCharCondCases --constant--> char
LamCharCondCases --constant--> character
LamCharCondCases --body--> LamExp
LamCharCondCases --next--> LamCharCondCases
LamMatch --index--> LamExp
Expand Down Expand Up @@ -141,7 +141,7 @@ LamExp --match--> LamMatch
LamExp --cond--> LamCond
LamExp --amb--> LamAmb
LamExp --print--> LamPrint
LamExp --character--> char
LamExp --character--> character
LamExp --back--> void_ptr
LamExp --error--> void_ptr
LamExp --cond_default--> void_ptr
Expand Down
6 changes: 3 additions & 3 deletions docs/generated/pratt.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ PrattTrie --character--> byte
PrattTrie --terminal--> HashSymbol
PrattTrie --siblings--> PrattTrie
PrattTrie --children--> PrattTrie
PrattBuffer --data--> string
PrattBuffer --start--> string
PrattBuffer --data--> ustring
PrattBuffer --start--> ustring
PrattBuffer --length--> int
PrattBufList --lineno--> int
PrattBufList --filename--> HashSymbol
Expand Down Expand Up @@ -50,7 +50,7 @@ PrattNumberState["enum PrattNumberState"]
PrattStringState["enum PrattStringState"]
PrattFixity["enum PrattFixity"]
PrattUTF8["PrattUTF8[]"] --entries--> uchar
PrattUnicode["PrattUnicode[]"] --entries--> char
PrattUnicode["PrattUnicode[]"] --entries--> character
PrattValueVal
PrattValueType
```
Expand Down
2 changes: 1 addition & 1 deletion docs/generated/tpmc.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TpmcPatternValue --var--> HashSymbol
TpmcPatternValue --comparison--> TpmcComparisonPattern
TpmcPatternValue --assignment--> TpmcAssignmentPattern
TpmcPatternValue --wildcard--> void_ptr
TpmcPatternValue --character--> char
TpmcPatternValue --character--> character
TpmcPatternValue --biginteger--> MaybeBigInt
TpmcPatternValue --constructor--> TpmcConstructorPattern
TpmcPatternValue --tuple--> TpmcPatternArray
Expand Down
2 changes: 1 addition & 1 deletion src/bytecode.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# include "cekfs.h"

// MUST remember to increment this if bytecodes change
# define CEKF_BYTECODE_VERSION 4
# define CEKF_BYTECODE_VERSION 5

enum ReadByteCodeStatus {
BYTECODES_OK,
Expand Down
91 changes: 43 additions & 48 deletions src/lambda_conversion.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,42 +36,26 @@

char *lambda_conversion_function = NULL; // set by --lambda-conversion flag

static LamLetRecBindings *convertFuncDefs(AstDefinitions *definitions,
LamContext *env);
static LamList *convertExpressions(AstExpressions *expressions,
LamContext *env);
static LamSequence *convertSequence(AstExpressions *expressions,
LamContext *env);
static LamLetRecBindings *prependDefinition(AstDefinition *definition,
LamContext *env,
LamLetRecBindings *next);
static LamLetRecBindings *prependDefine(AstDefine *define, LamContext *env, LamLetRecBindings *next);
static LamLetRecBindings *prependGensymDefine(AstGensymDefine *define, LamContext *env, LamLetRecBindings *next);
static LamExp *convertExpression(AstExpression *expression, LamContext *env);
static bool typeHasFields(AstTypeBody *typeBody);
static LamTypeDefList *collectTypeDefs(AstDefinitions *definitions,
LamContext *env);
static void collectAliases(AstDefinitions *definitions, LamContext *env);
static void collectMacros(AstDefinitions *definitions, LamContext *env);
static LamTypeConstructor *collectTypeConstructor(AstTypeConstructor
*typeConstructor,
LamType *type, int size,
int index, bool needsVec,
LamContext *env);
static void collectTypeInfo(HashSymbol *symbol, AstTypeConstructorArgs *args,
LamTypeConstructor *type,
bool needsVec, int enumCount, int index,
int arity, LamContext *env);
static LamTypeConstructorArgs *convertAstTypeList(AstTypeList *typeList, LamContext *env);
static LamTypeConstructorArgs *convertAstTypeMap(AstTypeMap *typeMap, LamContext *env);
static LamTypeConstructorArgs *convertAstTypeConstructorArgs(AstTypeConstructorArgs *args, LamContext *env);
static HashSymbol *dollarSubstitute(HashSymbol *original);
static LamExp *convertNest(AstNest *nest, LamContext *env);
static LamExp *lamConvertDefsNsAndExprs(AstDefinitions *definitions,
AstNamespaceArray *nsArray,
AstExpressions *expressions,
LamContext *env);
static LamExp *convertSymbol(ParserInfo I, HashSymbol *symbol, LamContext *env);
static LamLetRecBindings *convertFuncDefs(AstDefinitions *, LamContext *);
static LamList *convertExpressions(AstExpressions *, LamContext *);
static LamSequence *convertSequence(AstExpressions *, LamContext *);
static LamLetRecBindings *prependDefinition(AstDefinition *, LamContext *, LamLetRecBindings *);
static LamLetRecBindings *prependDefine(AstDefine *, LamContext *, LamLetRecBindings *);
static LamLetRecBindings *prependGensymDefine(AstGensymDefine *, LamContext *, LamLetRecBindings *);
static LamExp *convertExpression(AstExpression *, LamContext *);
static bool typeHasFields(AstTypeBody *);
static LamTypeDefList *collectTypeDefs(AstDefinitions *, LamContext *);
static void collectAliases(AstDefinitions *, LamContext *);
static void collectMacros(AstDefinitions *, LamContext *);
static LamTypeConstructor *collectTypeConstructor(AstTypeConstructor *, LamType *, int, int, bool, LamContext *);
static void collectTypeInfo(HashSymbol *, AstTypeConstructorArgs *, LamTypeConstructor *, bool, int, int, int, LamContext *);
static LamTypeConstructorArgs *convertAstTypeList(AstTypeList *, LamContext *);
static LamTypeConstructorArgs *convertAstTypeMap(AstTypeMap *, LamContext *);
static LamTypeConstructorArgs *convertAstTypeConstructorArgs(AstTypeConstructorArgs *, LamContext *);
static HashSymbol *dollarSubstitute(HashSymbol *);
static LamExp *convertNest(AstNest *, LamContext *);
static LamExp *lamConvertDefsNsAndExprs(AstDefinitions *, AstNamespaceArray *, AstExpressions *, LamContext *);
static LamExp *convertSymbol(ParserInfo, HashSymbol *, LamContext *);

#ifdef DEBUG_LAMBDA_CONVERT
# include "debugging_on.h"
Expand All @@ -91,6 +75,10 @@ static void conversionError(ParserInfo I, char *message, ...) {
can_happen(" at +%d %s", I.lineno, I.filename);
}

/*
 * Builds a placeholder LamExp: a variable node, created via newLamExp_Var,
 * that refers to errorSymbol() and carries the parser info I.
 *
 * Callers in this file return it right after reporting a problem with
 * conversionError(), so that conversion can hand back a LamExp instead of
 * aborting.
 *
 * NOTE(review): how later stages treat a variable bound to errorSymbol()
 * is not visible here - confirm before relying on it.
 */
static LamExp *lamExpError(ParserInfo I) {
return newLamExp_Var(I, errorSymbol());
}

static void addCurrentNamespaceToContext(LamContext *context, int id) {
LamInfo *lamInfo = newLamInfo_Nsid(CPI(context), id);
int save = PROTECT(lamInfo);
Expand Down Expand Up @@ -786,13 +774,13 @@ static HashSymbol *dollarSubstitute(HashSymbol *symbol) {
#define CHECK_ONE_ARG(name, args) do { \
int count = countLamList(args); \
if (count != 1) \
cant_happen("expected 1 arg in " #name ", got %d", count); \
conversionError(CPI(args), "expected 1 arg in " #name ", got %d", count); \
} while(0)

#define CHECK_TWO_ARGS(name, args) do { \
int count = countLamList(args); \
if (count != 2) \
cant_happen("expected 2 args in " #name ", got %d", count); \
conversionError(CPI(args), "expected 2 args in " #name ", got %d", count); \
} while(0)

static LamExp *makeCallCC(LamList *args) {
Expand Down Expand Up @@ -851,7 +839,7 @@ static void bindMacroArgs(LamExpTable *table, LamVarList *fargs, LamList *aargs)
static LamExp *expandMacro(HashSymbol *name, LamMacro *macro, LamList *args) {
if (countLamList(args) != countLamVarList(macro->args)) {
conversionError(CPI(args), "wrong number of arguments to macro %s", name->name);
return newLamExp_Var(CPI(args), name);
return newLamExp_Error(CPI(args));
}
if (countLamList(args) == 0) {
return macro->exp;
Expand Down Expand Up @@ -1009,7 +997,8 @@ static void checkNoUnrecognisedTags(LamTypeTags *lamTags, AstTaggedExpressions *
static void checkTagNotDuplicate(HashSymbol *tag, AstTaggedExpressions *tags) {
if (tags == NULL) return;
if (tag == tags->tag) {
cant_happen("duplicate tag %s", tag->name);
conversionError(CPI(tags), "duplicate tag %s", tag->name);
return;
}
checkTagNotDuplicate(tag, tags->next);
}
Expand Down Expand Up @@ -1078,15 +1067,17 @@ static LamExp *makeConstructorApplication(LamExp *constructor, LamList *args) {

static LamExp *makeStructureApplication(LamExp *constructor, AstTaggedExpressions *tags, LamContext *env) {
if (constructor->val.constructor->tags == NULL) {
cant_happen("non-struct constructor applied to struct");
conversionError(CPI(constructor), "non-struct constructor applied to struct");
return lamExpError(CPI(tags));
}
checkAllTagsPresent(constructor->val.constructor->tags, tags);
checkNoUnrecognisedTags(constructor->val.constructor->tags, tags);
checkNoDuplicateTags(tags);
int arity = findUnderlyingArity(constructor);
int nargs = (int) countAstTaggedExpressions(tags);
if (nargs != arity) {
cant_happen("wrong number of args in structure application");
conversionError(CPI(constructor), "wrong number of args in structure application");
return lamExpError(CPI(tags));
}
LamList *args = convertTagsToArgs(constructor->val.constructor->tags, tags, env);
int save = PROTECT(args);
Expand Down Expand Up @@ -1117,7 +1108,8 @@ static LamTypeConstructorInfo *findConstructor(AstLookupOrSymbol *los, LamContex
static LamExp *convertStructure(AstStruct *structure, LamContext *env) {
LamTypeConstructorInfo *info = findConstructor(structure->symbol, env);
if (info == NULL) {
cant_happen("cannot find constructor");
conversionError(CPI(structure), "cannot find constructor");
return lamExpError(CPI(structure));
}
LamExp *constructor = newLamExp_Constructor(CPI(info), info);
int save = PROTECT(constructor);
Expand Down Expand Up @@ -1210,7 +1202,8 @@ static AstArgList *rewriteAstTaggedArgList(LamTypeTags *allTags, AstTaggedArgLis
static AstArg *rewriteAstUnpackStruct(AstUnpackStruct *structure, LamContext *env) {
LamTypeConstructorInfo *info = findConstructor(structure->symbol, env);
if (info->tags == NULL) {
cant_happen("constructor not a struct");
conversionError(CPI(structure), "constructor not a struct");
return newAstArg_Wildcard(CPI(structure));
}
AstArgList *args = rewriteAstTaggedArgList(info->tags, structure->argList, env);
int save = PROTECT(args);
Expand Down Expand Up @@ -1290,10 +1283,12 @@ static LamLam *convertCompositeBodies(int nargs, AstCompositeFunction *fun,
return result;
}

static LamExp *convertCompositeFun(AstCompositeFunction *fun, LamContext *env) {
static LamExp *convertCompositeFun(ParserInfo PI, AstCompositeFunction *fun, LamContext *env) {
ENTER(convertCompositeFun);
if (fun == NULL)
cant_happen("composite function with no components");
if (fun == NULL) {
conversionError(PI, "composite function with no components");
return lamExpError(PI);
}
int nargs = countAstArgList(fun->function->argList);
LamLam *lambda = convertCompositeBodies(nargs, fun, env);
DEBUG("convertCompositeBodies returned %p", lambda);
Expand Down Expand Up @@ -1395,7 +1390,7 @@ static LamExp *convertExpression(AstExpression *expression, LamContext *env) {
break;
case AST_EXPRESSION_TYPE_FUN:
DEBUG("fun");
result = convertCompositeFun(expression->val.fun, env);
result = convertCompositeFun(CPI(expression), expression->val.fun, env);
break;
case AST_EXPRESSION_TYPE_NEST:
DEBUG("nest");
Expand Down
4 changes: 2 additions & 2 deletions src/pratt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ structs:

# Both a parse buffer and a parse token data type (like yytext)
PrattBuffer:
data: string
start: string=NULL
data: ustring
start: ustring=NULL
length: int=0

# Stack of buffers parsed in order
Expand Down
Loading

0 comments on commit f7a9bfb

Please sign in to comment.