#
# This is somewhat broken. missing_close_id is causing close ids to be parsed
# when they shouldn't. Maybe remove it.
#
# NOTE(review): this copy was rebuilt from a version whose line breaks had
# been collapsed. Several spans also look like they lost angle-bracketed text
# (literals, patterns, type arguments, possibly whole definitions). Those
# spans are kept exactly as found and flagged individually below; restore
# them from a pristine copy before relying on this grammar.
#

context tags

	#
	# Regular Definitions
	#
	rl def_name_char /[\-A-Za-z0-9._:?]/
	rl def_name /[A-Za-z_:] def_name_char*/
	rl def_system_literal /'"' [^"]* '"' | "'" [^']* "'"/

	#
	# Scanner for tag names.
	#
	lex
		ignore /space+/
		token tag_id /def_name/
	end

	#
	# Scanner for attribute names.
	#
	lex
		ignore /space+/
		token attr_name /def_name_char+/
		literal `=
	end

	literal `> `/>

	# Scanner for attribute values.
	lex
		ignore /space+/
		token dquote_val /'"' ([^"] | '\\' any)* '"'/
		token squote_val /"'" ([^'] | '\\' any)* "'"/
		token unq_val /[^ \t\r\n<>"'] [^ \t\r\n<>]*/
	end

	#
	# Tokens
	#
	lex
		ignore /space+/

		# NOTE(review): the second literal and the comment pattern look
		# truncated (an empty comment pattern cannot be intended) -- confirm
		# against a pristine copy.
		literal `< `'/
		token doc_data /[^<]+/
		token comment /''/
	end

	#
	# Tags
	#

	# Returns true when a tag_id whose text equals id is found on TagStack.
	bool inTagStack( id: str )
	{
		LocalTagStack: tag_stack = TagStack
		for Tag: tag_id in LocalTagStack {
			if id == Tag.data
				return true
		}
		return false
	}

	# This scanner is just for the id in close tags. The id needs to be looked up
	# in the tag stack so we can determine if it is a stray.
	lex
		# Ignore whitespace.
		ignore /space+/

		# Declared with empty patterns; the close_id action below injects
		# them with make_token.
		token stray_close_id //
		token missing_close_id //

		token close_id /def_name/
		{
			# If it is in the tag stack then it is a close_id. If not then it's a
			# stray_close_id.
			#
			# NOTE(review): the typeid type arguments were absent in the copy
			# this was rebuilt from. They are filled in from the token names
			# above and the message printed next to each push -- confirm.
			if ( inTagStack( match_text ) ) {
				print[ 'CLOSE \'' match_text '\' IN TAG STACK\n' ]

				# The tag is in the stack, send missing close tags until we get to it.
				match TagStack [Top:tag_id Rest:tag_stack]
				TagStack = Rest
				while ( Top.data != match_text ) {
					print( 'SENDING missing close\n' )
					input->push( make_token( typeid<missing_close_id>, '' ) )
					match TagStack [Top2:tag_id Rest2:tag_stack]
					Top = Top2
					TagStack = Rest2
				}

				print[ 'SENDING close\n' ]
				input->push( make_token( typeid<close_id>, input->pull( match_length ) ) )
			}
			else {
				print[ 'CLOSE \'' match_text '\' NOT IN TAG STACK\n' ]

				# The tag is not in the tag stack so send the id as a stray close.
				input->push( make_token( typeid<stray_close_id>, input->pull( match_length ) ) )
			}
		}
	end

	#
	# Tag Stack
	#
	def tag_stack
		[tag_id tag_stack]
	|	[]

	TagStack: tag_stack

	#
	# Document Type
	#

	# This scanner handles inside DOCTYPE tags (except keywords).
	lex
		ignore /space+/
		token dt_name /def_name/
		token dt_literal /def_system_literal/
		token dt_bl /"[" [^\]]* "]"/
		token dt_close /'>'/
	end

	# Using a separate scanner for the keywords in DOCTYPE prevents them from
	# covering dt_name
	lex
		ignore /space+/
		literal `SYSTEM `PUBLIC
	end

	# NOTE(review): text appears to be missing here. The production body is a
	# lone backtick, and the attached action pushes r2 onto TagStack, which
	# reads like it belongs to an open-tag production rather than DOCTYPE.
	# tag, open_tag, unclosed_tag and close_tag are referenced in this file
	# but not defined in this copy. Kept as found.
	def DOCTYPE
		[`]
		{
			TagStack = construct tag_stack
				[r2 TagStack]
		}

	#
	# Empty tags
	#
	def empty_tag
		[`< tag_id attr* `/>]

	#
	# Stray close tags
	#
	def stray_close
		[close_tag]

	#
	# Attributes
	#
	def attr
		[attr_name eql_attr_val?]

	def eql_attr_val
		[`= attr_val]

	def attr_val
		[squote_val]
	|	[dquote_val]
	|	[unq_val]
	|	[]

	#
	# Items
	#
	def item
		[DOCTYPE]
	|	[tag]
	|	[unclosed_tag]
	|	[empty_tag]
	|	[stray_close]
	|	[doc_data]
	|	[comment]

	token trailing /any*/

	def start
		[item* trailing]

	#
	# END GRAMMAR
	#

	# Walks every open_tag in Start that satisfies the require pattern and, when
	# none of its attributes matches "alt=", replaces the empty tail of its
	# attribute list with a default alt attribute.
	#
	# NOTE(review): the parameter type (bare ref) and the require pattern look
	# truncated -- kept as found, confirm against a pristine copy.
	int addDefaultAltTags( Start: ref )
	{
		for T: open_tag in Start {
			require T
				["']

			haveAlt: bool = false
			for A: attr in T {
				if match A ["alt=" attr_val]
					haveAlt = true
			}

			if !haveAlt {
				for AL: attr* in T {
					if match AL [] {
						AL = construct attr*
							[" alt=\"default alt\""]
						break
					}
				}
			}
		}
	}

	# Prints the inner items and the href value for tags in Start that satisfy
	# the require pattern.
	#
	# NOTE(review): the require pattern looks truncated, and AttrList is not
	# bound anywhere in the visible code -- kept as found, confirm.
	int printLinks( Start: start )
	{
		for A:tag in Start {
			require A
				["" I: item* ""]

			for Attr: attr in AttrList {
				if match Attr ["href = " AttrVal: attr_val]
					print[ 'link: ' I '\ntarget: ' AttrVal '\n\n' ]
			}
		}
	}

	# Always true; only referenced from the commented-out close() below.
	bool should_close( TI: tag_id )
	{
		return true
	}

	# Always true; only referenced from the commented-out flatten() below.
	bool should_flatten( TI: tag_id )
	{
		return true
	}

end # tags

# Finds unclosed tags and puts the content after the tag. Afterwards
# all unclosed tags will be empty 'inside'.
#int flatten( Start: ref )
#{
#	for TL: item* in Start {
#		require TL
#			[OT: open_tag Inside: item* Trailing: item*]
#
#		match OT
#			['<' TagId: tag_id attr* '>']
#
#		if should_flatten( TagId )
#		{
#			require Inside
#				[item item*]
#
#			# Put Trailing at the end of inside.
#			for END: item* in Inside {
#				if match END [] {
#					END = Trailing
#					break
#				}
#			}
#
#			str empty = ''
#			missing_close_id Missing = construct missing_close_id [empty]
#			opt_close_tag EmptyCloseTag =
#				construct opt_close_tag [Missing]
#
#			# Close the tag and put inside after it.
#			TL = construct item*
#				[OT EmptyCloseTag Inside]
#		}
#	}
#}
#
#int close( Start: ref )
#{
#	for TL: item in Start {
#		require TL
#			[OpenTag: open_tag Inside: item*]
#
#		match OpenTag
#			['<' TagId: tag_id attr* '>']
#
#		if should_close( TagId )
#		{
#			close_id CloseId = construct close_id
#				[TagId.data]
#
#			opt_close_tag CloseTag =
#				construct opt_close_tag ['']
#
#			# Close the tag and put inside after it.
#			TL = construct item
#				[OpenTag Inside CloseTag]
#		}
#	}
#}

Tags: tags = new tags()
Tags->TagStack = construct tags::tag_stack []
parse HTML: tags::start(Tags)[ stdin ]
print( HTML )

#print_xml( HTML )
#for C: close_tag in HTML
#	print( C '\n' )
##### IN #####
&FOO
##### EXP #####
&FOO