summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdrian Thurston <thurston@colm.net>2020-01-08 16:55:45 +0200
committerAdrian Thurston <thurston@colm.net>2020-01-08 16:58:39 +0200
commit1bf8df38ebf912b454c7db1fe2e2a38c91526900 (patch)
treed3d8cb9ea15a03a94f5ed7cae4ddc099ae365952
parent6c670b0a9851bd5ec3c7a90f20b164be2a54c5b2 (diff)
downloadcolm-1bf8df38ebf912b454c7db1fe2e2a38c91526900.tar.gz
colm: using an unsigned char as alph type in scanner
Switching to unsigned so that we can reference chars with the high bit set in the way that is common when discussing UTF encodings and Unicode: using hex values. Negative values are uncomfortable to work with and don't lend themselves to specifying ranges. This change is for the parser and stream/input code only. The collected tree data still uses char, which makes it convenient to extract the data from a top-down or bottom-up load. If we changed the tree type we would need to cast the pointers all over the loading code. refs #104 refs #97 refs #81
-rw-r--r--colm/bytecode.c31
-rw-r--r--colm/colm.h4
-rw-r--r--colm/compiler.cc8
-rw-r--r--colm/compiler.h2
-rw-r--r--colm/ctinput.cc27
-rw-r--r--colm/input.c10
-rw-r--r--colm/input.h30
-rw-r--r--colm/keyops.h8
-rw-r--r--colm/main.cc13
-rw-r--r--colm/pdacodegen.cc2
-rw-r--r--colm/pdarun.c32
-rw-r--r--colm/pdarun.h8
-rw-r--r--colm/print.c11
-rw-r--r--colm/stream.c28
-rw-r--r--colm/string.c4
15 files changed, 122 insertions, 96 deletions
diff --git a/colm/bytecode.c b/colm/bytecode.c
index 155a6a25..af50f679 100644
--- a/colm/bytecode.c
+++ b/colm/bytecode.c
@@ -35,6 +35,7 @@
#include <colm/pool.h>
#include <colm/debug.h>
+#include <colm/colm.h>
#define TRUE_VAL 1
#define FALSE_VAL 0
@@ -215,7 +216,7 @@ static word_t stream_append_text( program_t *prg, tree_t **sp, input_t *dest, tr
colm_print_tree_collect( prg, sp, &collect, input, trim );
/* Load it into the input. */
- impl->funcs->append_data( prg, impl, collect.data, collect.length );
+ impl->funcs->append_data( prg, impl, colm_alph_from_cstr( collect.data ), collect.length );
length = collect.length;
str_collect_destroy( &collect );
}
@@ -238,7 +239,7 @@ static word_t stream_append_tree( program_t *prg, tree_t **sp, input_t *dest, tr
colm_print_tree_collect( prg, sp, &collect, to_append, false );
/* Load it into the to_append. */
- impl->funcs->append_data( prg, impl, collect.data, collect.length );
+ impl->funcs->append_data( prg, impl, colm_alph_from_cstr( collect.data ), collect.length );
length = collect.length;
str_collect_destroy( &collect );
}
@@ -288,10 +289,11 @@ static tree_t *stream_pull_bc( program_t *prg, tree_t **sp, struct pda_run *pda_
}
-static void undo_stream_pull( struct colm_program *prg, struct input_impl *is, const char *data, long length )
+static void undo_stream_pull( struct colm_program *prg, struct input_impl *is,
+ const char *data, long length )
{
//debug( REALM_PARSE, "undoing stream pull\n" );
- is->funcs->undo_consume_data( prg, is, data, length );
+ is->funcs->undo_consume_data( prg, is, colm_alph_from_cstr( data ), length );
}
static void undo_pull( program_t *prg, input_t *input, tree_t *str )
@@ -407,12 +409,14 @@ static void downref_locals( program_t *prg, tree_t ***psp,
}
}
+
static tree_t *construct_arg0( program_t *prg, int argc, const char **argv, const int *argl )
{
tree_t *arg0 = 0;
if ( argc > 0 ) {
- size_t len = argl != 0 ? argl[0] : strlen(argv[0]);
- head_t *head = colm_string_alloc_pointer( prg, argv[0], len );
+ const char *argv0 = argv[0];
+ size_t len = argl != 0 ? argl[0] : strlen( argv[0] );
+ head_t *head = colm_string_alloc_pointer( prg, argv0, len );
arg0 = construct_string( prg, head );
colm_tree_upref( prg, arg0 );
}
@@ -425,7 +429,8 @@ static list_t *construct_argv( program_t *prg, int argc, const char **argv, cons
int i;
for ( i = 1; i < argc; i++ ) {
size_t len = argl != 0 ? argl[i] : strlen(argv[i]);
- head_t *head = colm_string_alloc_pointer( prg, argv[i], len );
+ const char *argv_i = argv[i];
+ head_t *head = colm_string_alloc_pointer( prg, argv_i, len );
tree_t *arg = construct_string( prg, head );
colm_tree_upref( prg, arg );
@@ -590,7 +595,9 @@ tree_t *colm_run_func( struct colm_program *prg, int frame_id,
((value_t*)execution.call_args)[p] = 0;
}
else {
- head_t *head = colm_string_alloc_pointer( prg, params[p], strlen(params[p]) );
+ const char *param_p = params[p];
+ size_t param_len = strlen(params[p]);
+ head_t *head = colm_string_alloc_pointer( prg, param_p, param_len );
tree_t *tree = construct_string( prg, head );
colm_tree_upref( prg, tree );
((tree_t**)execution.call_args)[p] = tree;
@@ -966,13 +973,14 @@ again:
* the local frame now. */
struct lang_el_info *lel_info = prg->rtd->lel_info;
struct pda_run *pda_run = exec->parser->pda_run;
- char **mark = pda_run->mark;
+ alph_t **mark = pda_run->mark;
int i, num_capture_attr = lel_info[pda_run->token_id].num_capture_attr;
for ( i = 0; i < num_capture_attr; i++ ) {
struct lang_el_info *lei = &lel_info[exec->parser->pda_run->token_id];
CaptureAttr *ca = &prg->rtd->capture_attr[lei->capture_attr + i];
- head_t *data = string_alloc_full( prg, mark[ca->mark_enter],
+ head_t *data = string_alloc_full( prg,
+ colm_cstr_from_alph( mark[ca->mark_enter] ),
mark[ca->mark_leave] - mark[ca->mark_enter] );
tree_t *string = construct_string( prg, data );
colm_tree_upref( prg, string );
@@ -3186,7 +3194,8 @@ again:
tree_t *str = 0;
if ( tree->tokdata->location ) {
const char *fn = tree->tokdata->location->name;
- head_t *data = string_alloc_full( prg, fn, strlen(fn) );
+ size_t fnlen = strlen( fn );
+ head_t *data = string_alloc_full( prg, fn, fnlen );
str = construct_string( prg, data );
colm_tree_upref( prg, str );
}
diff --git a/colm/colm.h b/colm/colm.h
index 5091d27e..39506cab 100644
--- a/colm/colm.h
+++ b/colm/colm.h
@@ -45,6 +45,7 @@ struct indent_impl
extern struct colm_sections colm_object;
typedef unsigned long colm_value_t;
+typedef unsigned char colm_alph_t;
struct colm_tree
{
@@ -99,6 +100,9 @@ struct colm_tree *colm_get_left_repeat_next( struct colm_tree *tree );
struct colm_tree *colm_get_left_repeat_val( struct colm_tree *tree );
struct colm_location *colm_find_location( struct colm_program *prg, struct colm_tree *tree );
+static inline const colm_alph_t *colm_alph_from_cstr( const char *cstr ) { return (const colm_alph_t*)cstr; }
+static inline const char *colm_cstr_from_alph( const colm_alph_t *alph ) { return (const char*)alph; }
+
/* Debug realms. To turn on, pass to colm_set_debug before invocation. */
#define COLM_DBG_BYTECODE 0x00000001
#define COLM_DBG_PARSE 0x00000002
diff --git a/colm/compiler.cc b/colm/compiler.cc
index 6b2b1032..72f87dac 100644
--- a/colm/compiler.cc
+++ b/colm/compiler.cc
@@ -86,12 +86,13 @@ Key makeFsmKeyHex( char *str, const InputLoc &loc, Compiler *pd )
unsigned long ul = strtoul( str, 0, 16 );
+
if ( errno == ERANGE || (unusedBits && ul >> (size * 8)) ) {
error(loc) << "literal " << str << " overflows the alphabet type" << endl;
ul = 1 << (size * 8);
}
- if ( unusedBits && ul >> (size * 8 - 1) )
+ if ( keyOps->alphType->isSigned && unusedBits && ul >> (size * 8 - 1) )
ul |= (ULONG_MAX >> (size*8 ) ) << (size*8);
return Key( (long)ul );
@@ -492,7 +493,8 @@ void Compiler::initGraphDict( )
void Compiler::initKeyOps( )
{
/* Signedness and bounds. */
- HostType *alphType = alphTypeSet ? userAlphType : hostLang->defaultAlphType;
+ const HostType *alphType = alphTypeSet ? userAlphType :
+ &hostLang->hostTypes[hostLang->defaultHostType];
thisKeyOps.setAlphType( alphType );
if ( lowerNum != 0 ) {
@@ -1022,7 +1024,7 @@ pda_run *Compiler::parsePattern( program_t *prg, tree_t **sp, const InputLoc &lo
if ( pdaRun->parse_error_text != 0 ) {
colm_data *tokdata = pdaRun->parse_error_text->tokdata;
cerr << ": relative error: ";
- cerr.write( tokdata->data, tokdata->length );
+ cerr.write( (const char*)tokdata->data, tokdata->length );
}
else {
cerr << ": parse error";
diff --git a/colm/compiler.h b/colm/compiler.h
index f22b33e3..67d5b40e 100644
--- a/colm/compiler.h
+++ b/colm/compiler.h
@@ -596,7 +596,7 @@ struct Compiler
int nextPriorKey, nextNameId;
/* Alphabet type. */
- HostType *userAlphType;
+ const HostType *userAlphType;
bool alphTypeSet;
/* Element type and get key expression. */
diff --git a/colm/ctinput.cc b/colm/ctinput.cc
index efc2c69d..f8267487 100644
--- a/colm/ctinput.cc
+++ b/colm/ctinput.cc
@@ -88,7 +88,7 @@ struct input_impl *colm_impl_new_pat( char *name, Pattern *pattern )
}
int pat_get_parse_block( struct colm_program *prg, struct input_impl_ct *ss, int *pskip,
- char **pdp, int *copied )
+ alph_t **pdp, int *copied )
{
*copied = 0;
@@ -107,7 +107,7 @@ int pat_get_parse_block( struct colm_program *prg, struct input_impl_ct *ss, int
if ( avail > 0 ) {
/* The source data from the current buffer. */
- char *src = &buf->data[offset];
+ alph_t *src = (alph_t*)&buf->data[offset];
int slen = avail;
/* Need to skip? */
@@ -136,7 +136,7 @@ int pat_get_parse_block( struct colm_program *prg, struct input_impl_ct *ss, int
}
-int pat_get_data( struct colm_program *prg, struct input_impl_ct *ss, char *dest, int length )
+int pat_get_data( struct colm_program *prg, struct input_impl_ct *ss, alph_t *dest, int length )
{
int copied = 0;
@@ -215,7 +215,7 @@ int pat_consume_data( struct colm_program *prg, struct input_impl_ct *ss, int le
return consumed;
}
-int pat_undo_consume_data( struct colm_program *prg, struct input_impl_ct *ss, const char *data, int length )
+int pat_undo_consume_data( struct colm_program *prg, struct input_impl_ct *ss, const alph_t *data, int length )
{
int origLen = length;
while ( true ) {
@@ -239,7 +239,7 @@ int pat_undo_consume_data( struct colm_program *prg, struct input_impl_ct *ss, c
}
LangEl *pat_consume_lang_el( struct colm_program *prg, struct input_impl_ct *ss, long *bindId,
- char **data, long *length )
+ alph_t **data, long *length )
{
LangEl *klangEl = ss->pat_item->prodEl->langEl;
*bindId = ss->pat_item->bindId;
@@ -310,7 +310,8 @@ struct input_impl *colm_impl_new_cons( char *name, Constructor *constructor )
return (struct input_impl*)ss;
}
-LangEl *repl_consume_lang_el( struct colm_program *prg, struct input_impl_ct *ss, long *bindId, char **data, long *length )
+LangEl *repl_consume_lang_el( struct colm_program *prg, struct input_impl_ct *ss,
+ long *bindId, alph_t **data, long *length )
{
LangEl *klangEl = ss->cons_item->type == ConsItem::ExprType ?
ss->cons_item->langEl : ss->cons_item->prodEl->langEl;
@@ -326,7 +327,7 @@ LangEl *repl_consume_lang_el( struct colm_program *prg, struct input_impl_ct *ss
ss->cons_item->prodEl->typeRef->pdaLiteral->data,
ss->cons_item->prodEl->typeRef->pdaLiteral->loc );
- *data = ss->cons_item->data;
+ *data = (alph_t*)ss->cons_item->data.data;
*length = ss->cons_item->data.length();
}
}
@@ -337,7 +338,7 @@ LangEl *repl_consume_lang_el( struct colm_program *prg, struct input_impl_ct *ss
}
int repl_get_parse_block( struct colm_program *prg, struct input_impl_ct *ss,
- int *pskip, char **pdp, int *copied )
+ int *pskip, alph_t **pdp, int *copied )
{
*copied = 0;
@@ -356,7 +357,7 @@ int repl_get_parse_block( struct colm_program *prg, struct input_impl_ct *ss,
if ( avail > 0 ) {
/* The source data from the current buffer. */
- char *src = &buf->data[offset];
+ alph_t *src = (alph_t*)&buf->data[offset];
int slen = avail;
/* Need to skip? */
@@ -384,7 +385,7 @@ int repl_get_parse_block( struct colm_program *prg, struct input_impl_ct *ss,
return INPUT_DATA;
}
-int repl_get_data( struct colm_program *prg, struct input_impl_ct *ss, char *dest, int length )
+int repl_get_data( struct colm_program *prg, struct input_impl_ct *ss, alph_t *dest, int length )
{
int copied = 0;
@@ -468,7 +469,7 @@ int repl_consume_data( struct colm_program *prg, struct input_impl_ct *ss, int l
return consumed;
}
-int repl_undo_consume_data( struct colm_program *prg, struct input_impl_ct *ss, const char *data, int length )
+int repl_undo_consume_data( struct colm_program *prg, struct input_impl_ct *ss, const alph_t *data, int length )
{
int origLen = length;
while ( true ) {
@@ -527,7 +528,7 @@ extern "C" void internalSendNamedLangEl( program_t *prg, tree_t **sp,
{
/* All three set by consumeLangEl. */
long bindId;
- char *data;
+ alph_t *data;
long length;
LangEl *klangEl = is->funcs->consume_lang_el( prg, is, &bindId, &data, &length );
@@ -537,7 +538,7 @@ extern "C" void internalSendNamedLangEl( program_t *prg, tree_t **sp,
/* Copy the token data. */
head_t *tokdata = 0;
if ( data != 0 )
- tokdata = string_alloc_full( prg, data, length );
+ tokdata = string_alloc_full( prg, colm_cstr_from_alph( data ), length );
kid_t *input = make_token_with_data( prg, pdaRun, is, klangEl->id, tokdata );
diff --git a/colm/input.c b/colm/input.c
index b8101c6f..4342249a 100644
--- a/colm/input.c
+++ b/colm/input.c
@@ -253,7 +253,7 @@ static void input_set_option( struct colm_program *prg, struct input_impl_seq *i
static int input_get_parse_block( struct colm_program *prg, struct input_impl_seq *is,
- int *pskip, char **pdp, int *copied )
+ int *pskip, alph_t **pdp, int *copied )
{
int ret = 0;
*copied = 0;
@@ -326,7 +326,7 @@ static int input_get_parse_block( struct colm_program *prg, struct input_impl_se
}
static int input_get_data( struct colm_program *prg, struct input_impl_seq *is,
- char *dest, int length )
+ alph_t *dest, int length )
{
int copied = 0;
@@ -414,7 +414,7 @@ static int input_consume_data( struct colm_program *prg, struct input_impl_seq *
}
static int input_undo_consume_data( struct colm_program *prg, struct input_impl_seq *si,
- const char *data, int length )
+ const alph_t *data, int length )
{
/* When we push back data we need to move backwards through the block of
* text. The source stream type will */
@@ -492,7 +492,7 @@ static void input_undo_consume_tree( struct colm_program *prg, struct input_impl
* Prepend
*/
static void input_prepend_data( struct colm_program *prg, struct input_impl_seq *si,
- const char *data, long length )
+ const alph_t *data, long length )
{
debug( prg, REALM_INPUT, "input_prepend_data: stream %p prepend data length %d\n", si, length );
@@ -577,7 +577,7 @@ static tree_t *input_undo_prepend_stream( struct colm_program *prg, struct input
}
static void input_append_data( struct colm_program *prg, struct input_impl_seq *si,
- const char *data, long length )
+ const alph_t *data, long length )
{
debug( prg, REALM_INPUT, "input_append_data: stream %p append data length %d\n", si, length );
diff --git a/colm/input.h b/colm/input.h
index f116561f..c0a896d5 100644
--- a/colm/input.h
+++ b/colm/input.h
@@ -54,24 +54,26 @@ struct colm_stream;
struct input_impl;
struct stream_impl;
+typedef colm_alph_t alph_t;
+
#define DEF_INPUT_FUNCS( input_funcs, _input_impl ) \
struct input_funcs \
{ \
- int (*get_parse_block)( struct colm_program *prg, struct _input_impl *si, int *pskip, char **pdp, int *copied ); \
- int (*get_data)( struct colm_program *prg, struct _input_impl *si, char *dest, int length ); \
+ int (*get_parse_block)( struct colm_program *prg, struct _input_impl *si, int *pskip, alph_t **pdp, int *copied ); \
+ int (*get_data)( struct colm_program *prg, struct _input_impl *si, alph_t *dest, int length ); \
int (*consume_data)( struct colm_program *prg, struct _input_impl *si, int length, struct colm_location *loc ); \
- int (*undo_consume_data)( struct colm_program *prg, struct _input_impl *si, const char *data, int length ); \
+ int (*undo_consume_data)( struct colm_program *prg, struct _input_impl *si, const alph_t *data, int length ); \
struct colm_tree *(*consume_tree)( struct colm_program *prg, struct _input_impl *si ); \
void (*undo_consume_tree)( struct colm_program *prg, struct _input_impl *si, struct colm_tree *tree, int ignore ); \
- struct LangEl *(*consume_lang_el)( struct colm_program *prg, struct _input_impl *si, long *bind_id, char **data, long *length ); \
+ struct LangEl *(*consume_lang_el)( struct colm_program *prg, struct _input_impl *si, long *bind_id, alph_t **data, long *length ); \
void (*undo_consume_lang_el)( struct colm_program *prg, struct _input_impl *si ); \
- void (*prepend_data)( struct colm_program *prg, struct _input_impl *si, const char *data, long len ); \
+ void (*prepend_data)( struct colm_program *prg, struct _input_impl *si, const alph_t *data, long len ); \
int (*undo_prepend_data)( struct colm_program *prg, struct _input_impl *si, int length ); \
void (*prepend_tree)( struct colm_program *prg, struct _input_impl *si, struct colm_tree *tree, int ignore ); \
struct colm_tree *(*undo_prepend_tree)( struct colm_program *prg, struct _input_impl *si ); \
void (*prepend_stream)( struct colm_program *prg, struct _input_impl *si, struct colm_stream *stream ); \
struct colm_tree *(*undo_prepend_stream)( struct colm_program *prg, struct _input_impl *si ); \
- void (*append_data)( struct colm_program *prg, struct _input_impl *si, const char *data, long length ); \
+ void (*append_data)( struct colm_program *prg, struct _input_impl *si, const alph_t *data, long length ); \
struct colm_tree *(*undo_append_data)( struct colm_program *prg, struct _input_impl *si, int length ); \
void (*append_tree)( struct colm_program *prg, struct _input_impl *si, struct colm_tree *tree ); \
struct colm_tree *(*undo_append_tree)( struct colm_program *prg, struct _input_impl *si ); \
@@ -87,11 +89,11 @@ struct input_funcs \
#define DEF_STREAM_FUNCS( stream_funcs, _stream_impl ) \
struct stream_funcs \
{ \
- int (*get_parse_block)( struct colm_program *prg, struct _stream_impl *si, int *pskip, char **pdp, int *copied ); \
- int (*get_data)( struct colm_program *prg, struct _stream_impl *si, char *dest, int length ); \
- int (*get_data_source)( struct colm_program *prg, struct _stream_impl *si, char *dest, int length ); \
+ int (*get_parse_block)( struct colm_program *prg, struct _stream_impl *si, int *pskip, alph_t **pdp, int *copied ); \
+ int (*get_data)( struct colm_program *prg, struct _stream_impl *si, alph_t *dest, int length ); \
+ int (*get_data_source)( struct colm_program *prg, struct _stream_impl *si, alph_t *dest, int length ); \
int (*consume_data)( struct colm_program *prg, struct _stream_impl *si, int length, struct colm_location *loc ); \
- int (*undo_consume_data)( struct colm_program *prg, struct _stream_impl *si, const char *data, int length ); \
+ int (*undo_consume_data)( struct colm_program *prg, struct _stream_impl *si, const alph_t *data, int length ); \
void (*transfer_loc)( struct colm_program *prg, struct colm_location *loc, struct _stream_impl *si ); \
struct colm_str_collect *(*get_collect)( struct colm_program *prg, struct _stream_impl *si ); \
void (*flush_stream)( struct colm_program *prg, struct _stream_impl *si ); \
@@ -99,7 +101,7 @@ struct stream_funcs \
void (*print_tree)( struct colm_program *prg, struct colm_tree **sp, \
struct _stream_impl *impl, struct colm_tree *tree, int trim ); \
struct stream_impl *(*split_consumed)( struct colm_program *prg, struct _stream_impl *si ); \
- int (*append_data)( struct colm_program *prg, struct _stream_impl *si, const char *data, int len ); \
+ int (*append_data)( struct colm_program *prg, struct _stream_impl *si, const alph_t *data, int len ); \
int (*undo_append_data)( struct colm_program *prg, struct _stream_impl *si, int length ); \
void (*destructor)( struct colm_program *prg, struct colm_tree **sp, struct _stream_impl *si ); \
int (*get_option)( struct colm_program *prg, struct _stream_impl *si, int option ); \
@@ -165,7 +167,7 @@ struct run_buf
/* Must be at the end. We will grow this struct to add data if the input
* demands it. */
- char data[FSM_BUFSIZE];
+ alph_t data[FSM_BUFSIZE];
};
struct run_buf *new_run_buf( int sz );
@@ -180,7 +182,7 @@ struct stream_impl_data
struct run_buf *tail;
} queue;
- const char *data;
+ const alph_t *data;
long dlen;
int offset;
@@ -220,7 +222,7 @@ struct colm_stream *colm_stream_open_collect( struct colm_program *prg );
char *colm_filename_add( struct colm_program *prg, const char *fn );
struct stream_impl *colm_impl_new_accum( char *name );
struct stream_impl *colm_impl_consumed( char *name, int len );
-struct stream_impl *colm_impl_new_text( char *name, const char *data, int len );
+struct stream_impl *colm_impl_new_text( char *name, const alph_t *data, int len );
#ifdef __cplusplus
}
diff --git a/colm/keyops.h b/colm/keyops.h
index 094b09e2..ed58db8d 100644
--- a/colm/keyops.h
+++ b/colm/keyops.h
@@ -89,6 +89,7 @@ struct HostType
{
const char *data1;
const char *data2;
+ bool isSigned;
long long minVal;
long long maxVal;
unsigned int size;
@@ -98,8 +99,7 @@ struct HostLang
{
HostType *hostTypes;
int numHostTypes;
- HostType *defaultAlphType;
- bool explicitUnsigned;
+ int defaultHostType;
};
extern HostLang *hostLang;
@@ -113,9 +113,9 @@ struct KeyOps
KeyOps() : alphType(0) {}
Key minKey, maxKey;
- HostType *alphType;
+ const HostType *alphType;
- void setAlphType( HostType *alphType )
+ void setAlphType( const HostType *alphType )
{
this->alphType = alphType;
minKey = (long) alphType->minVal;
diff --git a/colm/main.cc b/colm/main.cc
index 501ae16b..e7cbd5e9 100644
--- a/colm/main.cc
+++ b/colm/main.cc
@@ -103,12 +103,21 @@ void version();
/* Total error count. */
int gblErrorCount = 0;
+/*
+ * Alphabet Type for the parsing machinery. The trees/strings of parsed data
+ * all use char type. Currently we can support signed char, unsigned char or
+ * char. If changing this, the colm_alph_t type needs to change as well.
+ * Currently, this is a compile time change only. A colm binary currently
+ * cannot be made to work with multiple alphabet types.
+ */
+
HostType hostTypesC[] =
{
- { "char", 0, CHAR_MIN, CHAR_MAX, sizeof(char) },
+ { "unsigned", "char", false, 0, UCHAR_MAX, sizeof(unsigned char) },
};
-HostLang hostLangC = { hostTypesC, 8, hostTypesC+0, true };
+
+HostLang hostLangC = { hostTypesC, 1, 0 };
HostLang *hostLang = &hostLangC;
/* Print the opening to an error in the input, then return the error ostream. */
diff --git a/colm/pdacodegen.cc b/colm/pdacodegen.cc
index 5e069998..d6435ea9 100644
--- a/colm/pdacodegen.cc
+++ b/colm/pdacodegen.cc
@@ -397,7 +397,7 @@ void PdaCodeGen::writeRuntimeData( colm_sections *runtimeData, struct pda_tables
out << "static const char *" << litdata() << "[] = {\n";
for ( int i = 0; i < runtimeData->num_literals; i++ ) {
out << "\t\"";
- escapeLiteralString( out, runtimeData->litdata[i] );
+ escapeLiteralString( out, runtimeData->litdata[i], runtimeData->litlen[i] );
out << "\",\n";
}
out << "};\n\n";
diff --git a/colm/pdarun.c b/colm/pdarun.c
index 59b8a947..b2c2ca66 100644
--- a/colm/pdarun.c
+++ b/colm/pdarun.c
@@ -112,7 +112,7 @@ head_t *colm_stream_pull( program_t *prg, tree_t **sp, struct pda_run *pda_run,
pda_run->consume_buf = run_buf;
}
- char *dest = run_buf->data + run_buf->length;
+ alph_t *dest = run_buf->data + run_buf->length;
is->funcs->get_data( prg, is, dest, length );
location_t *loc = location_allocate( prg );
@@ -123,14 +123,14 @@ head_t *colm_stream_pull( program_t *prg, tree_t **sp, struct pda_run *pda_run,
pda_run->p = pda_run->pe = 0;
pda_run->tokpref = 0;
- head_t *tokdata = colm_string_alloc_pointer( prg, dest, length );
+ head_t *tokdata = colm_string_alloc_pointer( prg, colm_cstr_from_alph( dest ), length );
tokdata->location = loc;
return tokdata;
}
else {
head_t *head = init_str_space( length );
- char *dest = (char*)head->data;
+ alph_t *dest = (alph_t*)head->data;
is->funcs->get_data( prg, is, dest, length );
location_t *loc = location_allocate( prg );
@@ -143,7 +143,7 @@ head_t *colm_stream_pull( program_t *prg, tree_t **sp, struct pda_run *pda_run,
void colm_stream_push_text( struct colm_program *prg, struct input_impl *is, const char *data, long length )
{
- is->funcs->prepend_data( prg, is, data, length );
+ is->funcs->prepend_data( prg, is, colm_alph_from_cstr( data ), length );
}
void colm_stream_push_tree( struct colm_program *prg, struct input_impl *is, tree_t *tree, int ignore )
@@ -170,7 +170,7 @@ void colm_undo_stream_push( program_t *prg, tree_t **sp, struct input_impl *is,
/* Should only be sending back whole tokens/ignores, therefore the send back
* should never cross a buffer boundary. Either we slide back data, or we move to
* a previous buffer and slide back data. */
-static void send_back_text( struct colm_program *prg, struct input_impl *is, const char *data, long length )
+static void send_back_text( struct colm_program *prg, struct input_impl *is, const alph_t *data, long length )
{
//debug( REALM_PARSE, "push back of %ld characters\n", length );
@@ -209,7 +209,7 @@ static void send_back_ignore( program_t *prg, tree_t **sp,
if ( artificial )
send_back_tree( prg, is, parse_tree->shadow->tree );
else
- send_back_text( prg, is, string_data( head ), head->length );
+ send_back_text( prg, is, colm_alph_from_cstr( string_data( head ) ), head->length );
}
colm_decrement_steps( pda_run );
@@ -278,7 +278,7 @@ static void send_back( program_t *prg, tree_t **sp, struct pda_run *pda_run,
}
/* Push back the token data. */
- send_back_text( prg, is, string_data( parse_tree->shadow->tree->tokdata ),
+ send_back_text( prg, is, colm_alph_from_cstr( string_data( parse_tree->shadow->tree->tokdata ) ),
string_length( parse_tree->shadow->tree->tokdata ) );
/* If eof was just sent back remember that it needs to be sent again. */
@@ -377,7 +377,7 @@ kid_t *make_token_with_data( program_t *prg, struct pda_run *pda_run,
for ( i = 0; i < lel_info[id].num_capture_attr; i++ ) {
CaptureAttr *ca = &prg->rtd->capture_attr[lel_info[id].capture_attr + i];
head_t *data = string_alloc_full( prg,
- pda_run->mark[ca->mark_enter],
+ colm_cstr_from_alph( pda_run->mark[ca->mark_enter] ),
pda_run->mark[ca->mark_leave] -
pda_run->mark[ca->mark_enter] );
tree_t *string = construct_string( prg, data );
@@ -736,9 +736,9 @@ static head_t *extract_match( program_t *prg, tree_t **sp,
pda_run->consume_buf = run_buf;
}
- char *dest = run_buf->data + run_buf->length;
+ alph_t *dest = run_buf->data + run_buf->length;
- is->funcs->get_data( prg, is, dest, length );
+ is->funcs->get_data( prg, is, (alph_t*)dest, length );
location_t *location = location_allocate( prg );
is->funcs->consume_data( prg, is, length, location );
@@ -748,7 +748,7 @@ static head_t *extract_match( program_t *prg, tree_t **sp,
pda_run->tokpref = 0;
pda_run->tokstart = 0;
- head_t *head = colm_string_alloc_pointer( prg, dest, length );
+ head_t *head = colm_string_alloc_pointer( prg, colm_cstr_from_alph( dest ), length );
head->location = location;
@@ -793,7 +793,7 @@ static head_t *extract_no_l( program_t *prg, tree_t **sp,
pda_run->consume_buf = run_buf;
}
- char *dest = run_buf->data + run_buf->length;
+ alph_t *dest = run_buf->data + run_buf->length;
is->funcs->get_data( prg, is, dest, length );
@@ -808,7 +808,7 @@ static head_t *extract_no_l( program_t *prg, tree_t **sp,
pda_run->tokpref = 0;
pda_run->tokstart = 0;
- head_t *head = colm_string_alloc_pointer( prg, dest, length );
+ head_t *head = colm_string_alloc_pointer( prg, colm_cstr_from_alph( dest ), length );
/* Don't pass the location. */
head->location = 0;
@@ -849,14 +849,14 @@ static head_t *peek_match( program_t *prg, struct pda_run *pda_run, struct input
pda_run->consume_buf = run_buf;
}
- char *dest = run_buf->data + run_buf->length;
+ alph_t *dest = run_buf->data + run_buf->length;
is->funcs->get_data( prg, is, dest, length );
pda_run->p = pda_run->pe = 0;
pda_run->tokpref = 0;
- head_t *head = colm_string_alloc_pointer( prg, dest, length );
+ head_t *head = colm_string_alloc_pointer( prg, colm_cstr_from_alph( dest ), length );
head->location = location_allocate( prg );
is->funcs->transfer_loc( prg, head->location, is );
@@ -1093,7 +1093,7 @@ static long scan_token( program_t *prg, struct pda_run *pda_run, struct input_im
return SCAN_UNDO;
while ( true ) {
- char *pd = 0;
+ alph_t *pd = 0;
int len = 0;
int tokpref = pda_run->tokpref;
int type = is->funcs->get_parse_block( prg, is, &tokpref, &pd, &len );
diff --git a/colm/pdarun.h b/colm/pdarun.h
index 3cee2124..27e075c9 100644
--- a/colm/pdarun.h
+++ b/colm/pdarun.h
@@ -268,18 +268,18 @@ struct pda_run
long region, pre_region;
long fsm_cs, next_cs, act;
- char *start;
- char *tokstart;
+ alph_t *start;
+ alph_t *tokstart;
long tokend;
long tokpref;
- char *p, *pe;
+ alph_t *p, *pe;
char scan_eof;
char return_result;
char skip_tokpref;
char eof_term_recvd;
- char *mark[MARK_SLOTS];
+ alph_t *mark[MARK_SLOTS];
long matched_token;
/*
diff --git a/colm/print.c b/colm/print.c
index 2b69e86a..363a7eea 100644
--- a/colm/print.c
+++ b/colm/print.c
@@ -59,7 +59,7 @@ static void xml_escape_data( struct colm_print_args *print_args, const char *dat
void init_str_collect( str_collect_t *collect )
{
- collect->data = (char*) malloc( BUFFER_INITIAL_SIZE );
+ collect->data = malloc( BUFFER_INITIAL_SIZE );
collect->allocated = BUFFER_INITIAL_SIZE;
collect->length = 0;
collect->indent.indent = 0;
@@ -76,7 +76,7 @@ void str_collect_append( str_collect_t *collect, const char *data, long len )
long new_len = collect->length + len;
if ( new_len > collect->allocated ) {
collect->allocated = new_len * 2;
- collect->data = (char*) realloc( collect->data, collect->allocated );
+ collect->data = realloc( collect->data, collect->allocated );
}
memcpy( collect->data + collect->length, data, len );
collect->length += len;
@@ -91,7 +91,7 @@ void str_collect_clear( str_collect_t *collect )
void print_str( struct colm_print_args *print_args, head_t *str )
{
- print_args->out( print_args, (char*)(str->data), str->length );
+ print_args->out( print_args, str->data, str->length );
}
void append_collect( struct colm_print_args *args, const char *data, int length )
@@ -128,7 +128,7 @@ restart:
}
}
else {
- char *nl;
+ const char *nl;
if ( args->indent->level != COLM_INDENT_OFF &&
(nl = memchr( data, '\n', length )) )
{
@@ -559,7 +559,7 @@ static void xml_term( program_t *prg, tree_t **sp,
else if ( kid->tree->id == LEL_ID_STR ) {
head_t *head = (head_t*) ((str_t*)kid->tree)->value;
- xml_escape_data( print_args, (char*)(head->data), head->length );
+ xml_escape_data( print_args, head->data, head->length );
}
else if ( 0 < kid->tree->id && kid->tree->id < prg->rtd->first_non_term_id &&
kid->tree->id != LEL_ID_IGNORE &&
@@ -720,7 +720,6 @@ static void postfix_close( program_t *prg, tree_t **sp,
sprintf( buf, " %d", children );
args->out( args, buf, strlen( buf ) );
-
args->out( args, "\n", 1 );
}
}
diff --git a/colm/stream.c b/colm/stream.c
index 0aa836dd..abe1b636 100644
--- a/colm/stream.c
+++ b/colm/stream.c
@@ -166,7 +166,7 @@ struct run_buf *new_run_buf( int sz )
}
/* Keep the position up to date after consuming text. */
-void update_position_data( struct stream_impl_data *is, const char *data, long length )
+void update_position_data( struct stream_impl_data *is, const alph_t *data, long length )
{
int i;
for ( i = 0; i < length; i++ ) {
@@ -184,7 +184,7 @@ void update_position_data( struct stream_impl_data *is, const char *data, long l
}
/* Keep the position up to date after sending back text. */
-void undo_position_data( struct stream_impl_data *is, const char *data, long length )
+void undo_position_data( struct stream_impl_data *is, const alph_t *data, long length )
{
/* FIXME: this needs to fetch the position information from the parsed
* token and restore based on that.. */
@@ -221,7 +221,7 @@ static void data_transfer_loc( struct colm_program *prg, location_t *loc,
*/
static int data_get_data( struct colm_program *prg, struct stream_impl_data *ss,
- char *dest, int length )
+ alph_t *dest, int length )
{
int copied = 0;
@@ -249,7 +249,7 @@ static int data_get_data( struct colm_program *prg, struct stream_impl_data *ss,
/* Anything available in the current buffer. */
if ( avail > 0 ) {
/* The source data from the current buffer. */
- char *src = &buf->data[buf->offset];
+ alph_t *src = &buf->data[buf->offset];
int slen = avail < length ? avail : length;
memcpy( dest+copied, src, slen ) ;
@@ -280,7 +280,7 @@ static struct stream_impl *data_split_consumed( program_t *prg, struct stream_im
}
int data_append_data( struct colm_program *prg, struct stream_impl_data *sid,
- const char *data, int length )
+ const alph_t *data, int length )
{
struct run_buf *tail = sid->queue.tail;
if ( tail == 0 || length > (FSM_BUFSIZE - tail->length) ) {
@@ -414,7 +414,7 @@ static void data_print_tree( struct colm_program *prg, tree_t **sp,
}
static int data_get_parse_block( struct colm_program *prg, struct stream_impl_data *ss,
- int *pskip, char **pdp, int *copied )
+ int *pskip, alph_t **pdp, int *copied )
{
int ret = 0;
*copied = 0;
@@ -448,7 +448,7 @@ static int data_get_parse_block( struct colm_program *prg, struct stream_impl_da
/* Anything available in the current buffer. */
if ( avail > 0 ) {
/* The source data from the current buffer. */
- char *src = &buf->data[buf->offset];
+ alph_t *src = &buf->data[buf->offset];
/* Need to skip? */
if ( *pskip > 0 && *pskip >= avail ) {
@@ -523,9 +523,9 @@ static int data_consume_data( struct colm_program *prg, struct stream_impl_data
}
static int data_undo_consume_data( struct colm_program *prg, struct stream_impl_data *sid,
- const char *data, int length )
+ const alph_t *data, int length )
{
- const char *end = data + length;
+ const alph_t *end = data + length;
int amount = length;
if ( amount > sid->consumed )
amount = sid->consumed;
@@ -571,7 +571,7 @@ static int data_undo_consume_data( struct colm_program *prg, struct stream_impl_
*/
static int file_get_data_source( struct colm_program *prg, struct stream_impl_data *si,
- char *dest, int length )
+ alph_t *dest, int length )
{
return fread( dest, 1, length, si->file );
}
@@ -581,7 +581,7 @@ static int file_get_data_source( struct colm_program *prg, struct stream_impl_da
*/
static int accum_get_data_source( struct colm_program *prg, struct stream_impl_data *si,
- char *dest, int want )
+ alph_t *dest, int want )
{
long avail = si->dlen - si->offset;
long take = avail < want ? avail : want;
@@ -710,14 +710,14 @@ struct stream_impl *colm_impl_consumed( char *name, int len )
return (struct stream_impl*)si;
}
-struct stream_impl *colm_impl_new_text( char *name, const char *data, int len )
+struct stream_impl *colm_impl_new_text( char *name, const alph_t *data, int len )
{
struct stream_impl_data *si = (struct stream_impl_data*)
malloc(sizeof(struct stream_impl_data));
si_data_init( si, name );
si->funcs = (struct stream_funcs*)&accum_funcs;
- char *buf = (char*)malloc( len );
+ alph_t *buf = (alph_t*)malloc( len );
memcpy( buf, data, len );
si->data = buf;
@@ -778,7 +778,7 @@ stream_t *colm_stream_open_file( program_t *prg, tree_t *name, tree_t *mode )
}
/* Need to make a C-string (null terminated). */
- char *file_name = (char*)malloc(string_length(head_name)+1);
+ char *file_name = malloc(string_length(head_name)+1);
memcpy( file_name, string_data(head_name), string_length(head_name) );
file_name[string_length(head_name)] = 0;
diff --git a/colm/string.c b/colm/string.c
index 8a852e8b..d1d16aa9 100644
--- a/colm/string.c
+++ b/colm/string.c
@@ -273,9 +273,9 @@ head_t *make_literal( program_t *prg, long offset )
head_t *string_sprintf( program_t *prg, str_t *format, long integer )
{
head_t *format_head = format->value;
- long written = snprintf( 0, 0, string_data(format_head), integer );
+ long written = snprintf( 0, 0, (char*)string_data(format_head), integer );
head_t *head = init_str_space( written+1 );
- written = snprintf( (char*)head->data, written+1, string_data(format_head), integer );
+ written = snprintf( (char*)head->data, written+1, (char*)string_data(format_head), integer );
head->length -= 1;
return head;
}