As the first step in the decommissioning of sasCommunity.org the site has been converted to read-only mode.


Here are some tips for How to share your SAS knowledge with your professional network.


A Tagset for Parsing RTF

From sasCommunity
Jump to: navigation, search

Tagsets are created using Proc TEMPLATE. They are used in conjunction with ODS to generate SAS output that conforms to a specific destination format such as rtf, ExcelXP, OpenOfficeXML, etc... SAS Users can extend existing tagsets via Copy/Edit, Parent= statement so that their new tagset will respond to previously unhandled events, or have special processing specific to a task.

The tagset programming environment is rich in process flow control elements, has a range of types and the ability to use any SAS function that SQL allows. Two important types are dictionary-variables and list-variables. Dictionary-variables use a string value as a key, and List-Variables use a number.

  • set $MyDictionary['color'] 'red'
  • set $MyList[] 'James' [] must be used to add items
  • set $MyList[1] 'Jimmy' [list-index] must be used to replace an item

Note that despite numeric indexing, List-variables are not arrays.

A unique feature of tagsets is the capacity for event driven processing via the DEFINE EVENT and TRIGGER statements. The features of tagset programming can be utilized to perform an operational task totally unrelated to handling output delivered by SAS procedures.

The tagset environment does not have a visual debugger and the programmer has to resort to PUTLOG statements to track program flow during development.

The parser and the translator

The following tagset, eeRtfToHtmlTranslator, was written to convert SAS source code saved from Enhanced Editor as RTF (eeRtf) (as shown in this article) into HTML.

Some hightlights to examine:

  • regular expressions
  • external file input (filename,fopen,fread,fget,fclose)
  • TRIGGERing events that do not have an DEFINEd EVENT handler
    • This allows children to easy implement extended capabilities
  • extension via PARENT=
  • Two tagsets
    • eeRtfParser, not a complete parser for arbitrary rtf but will handle eeRtf
    • eeRtfToHtmlTranslator, extends the parser by implementing DEFINE EVENT tokenHandler


ods path 
  sasuser.templat (update)
  sashelp.tmplmst (read)
;
 
proc template;
  define tagset
    tagsets.eeRtfParser / store=sasuser.templat;
 
    define event
    doc;
      start:
 
      eval $rxhex prxparse("/^''([0-9a-f]{2})/");
 
      eval $rxwrd prxparse("/^([a-z]{1,32})(-?\d+)?(\s|[^a-z0-9])$/");
 
      trigger main start;
      trigger main finish;
    end;
 
    define event
    main;
     start:
      set $rtfFile $options['RTFFILE'];
 
      do / if not $rtfFile;
        putlog 'WARNING: RtfFile not specified.';
        break;
      done;
 
      set $fileref 'rtf_src';
      eval $rcRtfRef filename($fileref,$rtfFile);
      do / if $rcRtfRef ne 0;
        set $msg sysmsg();
        putlog $msg;
        break;
      done;
 
      trigger pre_parse_start;
      trigger parser start;
      trigger parser finish;
      trigger post_parse_finish;
 
     finish:
      break / if not $rtfFile;
 
      do / if $rcRtfRef ne 0;
        eval $rc filename($fileref,'');
      done;
    end;
 
    define event
    parser;
     start:
      eval $fid fopen($fileref,'I',1,'B');
      do / if $fid = 0;
        set $msg sysmsg();
        putlog $msg;
        break;
      done;
 
      set $state 'text';
      set $state2 '';
      set $inChar '00'x;
      eval $tokenCount 0;
      eval $reuseChar 0;
 
      set $token '';
 
      putlog 'Reading ' $rtfFile;
 
      do / while fread($fid) = 0;
        set $prevChar $inChar;
        set $rc fget($fid,$inChar,1);
 
        trigger processChar;
        trigger processChar / if $reuseChar;
 
        eval $reuseChar 0;
      done;
 
     finish:
      do / if $fid > 0;
        eval $rc fclose($fid);
        do / if $rc ne 0;
          set $msg sysmsg();
          putlog $msg;
        done;
      done;
    end;
 
    define event
    processChar;
      break / if cmp ('0D'x, $inChar);
      break / if cmp ('0A'x, $inChar);
 
      do / if cmp ('text', $state);
        do 
        / if cmp ('\', $inChar);
          trigger token;
          set $state 'control';
        else 
        / if cmp ('{', $inChar);
          trigger token;
          set $state 'group';
          set $token 'open';
          trigger token;
          set $state 'text';
        else
        / if cmp ('}', $inChar);
          trigger token;
          set $state 'group';
          set $token 'close';
          trigger token;
          set $state 'text';
        else;
          set $token cat($token,$inChar);
        done;
 
        break;
      done;
 
      do / if cmp ('control', $state);
        do
        / if indexc ($prevChar, '0D0A'x);
          trigger token;
          set $state 'text';
          eval $reuseChar 1;
          break;
        done;
 
        set $token cat($token,$inChar);
 
        do 
        / if cmp ('*', $token);
          trigger token;
          set $state 'text';
        else
        / if indexc ($token, "-_~:|{}\") = 1; 
          set $state 'escape';
          trigger token;
          set $state  'text';
        else
        / if prxmatch ($rxhex, $token);
          set $state 'hex';
          trigger token;
          set $state  'text';
        else
        / if prxmatch ($rxwrd, $token);
          set $param prxposn ($rxwrd, 2, $token);
          set $token prxposn ($rxwrd, 1, $token);
          trigger token;
          set $state 'text';
 
          do / if NOT cmp(' ',$inChar);
            eval $reuseChar 1;
          done;
        done;
 
        break;
      done;
    end;
 
    define event
    token;
      trigger tokenHandler / if exists($token);
 
      unset $state;
      unset $token;
      unset $param;
    end;
 
  end; * tagset;
 
%*-------------------------------------------------------------;
 
  %* Note: aspects related to font name and size ignored
  %*  because source content is (assumed) SAS source code (as .rtf) 
  %*  and target output is html;
 
  define tagset
    tagsets.eeRtfToHtmlTranslator / store=sasuser.templat;
 
    parent = tagsets.eeRtfParser;
 
    define event
    pre_parse_start;
*     putlog 'prestart';
      put '<HTML>' nl;
      eval $depth 0;
      set $fonttbl_state 'undefined';
      set $colortbl_state 'undefined';
      set $head_state 'undefined';
      set $style_state 'undefined';
      set $body_state 'undefined';
    end;
 
    define event
    post_parse_finish;
*     putlog 'postfini';
      put '</HTML>';
    end;
 
    define event
    tokenHandler;
      unset $paramNum;
      eval $paramNum inputn($param,'best12.');
 
*     break / if $tokenCount > 120;
      eval $tokenCount $tokenCount + 1;
*     putlog $state ' ' $token ' ' $param ;
 
      do 
      / if cmp('group',$state) and cmp('open',$token);
        trigger push_state;
      else
      / if cmp('group',$state) and cmp('close',$token);
        trigger pop_state;
      else
      / if cmp('control',$state) 
       and cmp('colortbl',$token) 
       and cmp('undefined',$colortbl_state);
        trigger colortbl_open;
      else
      /  if cmp('opened',$colortbl_state);
        trigger rgb_accrete;
      else
      / if cmp('control',$state) 
       and cmp('fonttbl',$token) 
       and cmp('undefined',$fonttbl_state);
        trigger fonttbl_open;
      else
      / if cmp('opened',$fonttbl_state);
        trigger font_accrete;
      else
      / if cmp('control',$state);
        trigger handle_control;
      else
      / if cmp('escape',$state);
        trigger handle_escape;
      else
      / if cmp('text',$state);
        trigger put_text;
      done;
    end;
 
    define event
    fonttbl_open;
      set $fonttbl_state 'opened';
 
      eval $close_tbl_depth $depth;
      set $close_tbl $token;
    end;
 
    define event
    fonttbl_close;
      set $fonttbl_state 'closed';
 
      break / if NOT $fonts;
/*
      trigger style_open;
 
      iterate $fonts;
      do / while _NAME_;
        put 'span.f' _NAME_ ' {font-family:';
        putq _VALUE_;
        put '}' nl;
        next $fonts;
      done;
 
      trigger style_close;
*/
      unset $fonts;
    end;
 
    define event
    font_accrete;
      do 
      / if cmp('control',$state) and cmp('f',$token);
        set $fontkey $param;
      else
      / if cmp('control',$state) ;
        * rtf font-family ;
      else
      / if cmp('text',$state) and contains($token,';');
        * semi should be last char;
        set $fontname scan($token,1,';');
        set $fonts[$fontkey] $fontname;
        unset $fontname;
      done;
    end;
 
    define event
    colortbl_open;
      set $colortbl_state 'opened';
 
      eval $close_tbl_depth $depth;
      set $close_tbl $token;
 
      trigger rgb_reset;
    end;
 
    define event
    colortbl_close;
      set $colortbl_state 'closed';
      unset $r;
      unset $g;
      unset $b;
 
      break / if NOT $colors;
 
      trigger style_open;
 
      eval $index 1;
      do / while $index <= $colors;
        eval $index0 $index-1;
        set  $index0 '0' / if $index = 1;
 
        put 'span.cf' $index0 ' {color:     ' $colors[$index] '}' nl;
        put 'span.cb' $index0 ' {background:' $colors[$index] '}' nl;
        eval $index $index+1;
      done;
 
      trigger style_close;
    end;
 
    define event
    rgb_reset;
      eval $r 0;
      eval $g 0;
      eval $b 0;
      unset $hexvalue;
      unset $hexstring;
      unset $cssColor;
    end;
 
    define event
    rgb_accrete;
      do 
      / if cmp('control',$state) and cmp('red',$token);
        eval $r BAND($paramNum,255);
      else 
      / if cmp('control',$state) and cmp('green',$token);
        eval $g BAND($paramNum,255);
      else 
      / if cmp('control',$state) and cmp('blue',$token);
        eval $b BAND($paramNum,255);
      else
      / if cmp('text',$state) and cmp(';',$token);
        eval $hexvalue blshift($r,16) + blshift($g,8) + $b;
        set $hexstring putn($hexvalue,'hex6.');
 
        set $cssColor cat('#',$hexstring);
        set $colors[] $cssColor;
 
        trigger rgb_reset;
      done;
    end;
 
    define event
    head_open;
      break / if NOT cmp ('undefined',$head_state);
 
      set $head_state 'opened';
      put '<HEAD>' nl;
      trigger style_open;
      put 'span.u {text-decoration: underline}' nl;
      put 'span.b {font-weight:bold}' nl;
      put 'span.i {font-style:italic}' nl;
      trigger style_close;
    end;
 
    define event
    head_close;
      break / if NOT cmp ('opened',$head_state);
 
      set $head_state 'closed';
      put '</HEAD>' nl;
    end;
 
    define event
    body_open;
      break / if NOT cmp ('undefined',$body_state);
 
      trigger head_close;
 
      set $body_state 'opened';
      put '<BODY><PRE>' nl;
    end;
 
    define event
    body_close;
      break / if NOT cmp ('opened',$body_state);
 
      set $body_state 'closed';
      put '</PRE></BODY>' nl;
    end;
 
    define event
    style_open;
      trigger head_open;
 
      break / if NOT cmp ('opened',$head_state);
      break / if cmp ('opened',$style_state);
 
      set $style_state 'opened';
      put '<STYLE type="text/css">' nl;
    end;
 
    define event
    style_close;
      break / if NOT cmp ('opened',$style_state);
 
      set $style_state 'closed';
      put '</STYLE>' nl;
    end;
 
    define event
    push_state;
      eval $depth $depth+1;
*putlog 'push depth' '=' $depth;
    end;
 
    define event
    pop_state;
*putlog 'pop  depth' '=' $depth ;
 
      do / if $depth = $close_tbl_depth;
        trigger fonttbl_close  / if cmp ('fonttbl',  $close_tbl);
        trigger colortbl_close / if cmp ('colortbl', $close_tbl);
      done;
 
      eval $depth $depth-1;
 
      trigger body_close / if $depth = 0;
    end;
 
    define event
    handle_escape;
      set $output htmlencode($token);
      put $output;
      unset $output;
    end;
 
    define event
    handle_control;
      do 
/*
      / if cmp('f',$token);
        trigger f_control;
      else
      / if cmp('fs',$token);
        trigger fs_control;
      else
*/
      / if cmp('par',$token);
        trigger par_control;
      else
      / if cmp('cf',$token);
        trigger cf_control;
      else
      / if cmp('cb',$token);
        trigger cb_control;
      else
      / if cmp('chcbpat',$token);
        trigger chcbpat_control;
      else
      / if cmp('ul',$token);
        trigger ul_control;
      else
      / if cmp('b',$token);
        trigger b_control;
      else
      / if cmp('i',$token);
        trigger i_control;
      done; 
    end;
/*
    define event f_control;
      set $fontkey $param;
    end;
 
    define event fs_control;
      eval $fontsize $paramNum / 2;
    end;
*/
    define event 
    par_control;
      put nl;
    end;
 
    define event 
    cf_control;
      set $foregroundColorKey $param;
    end;
 
    define event 
    cb_control;
      set $backgroundColorKey $param;
    end;
 
    define event
    chcbpat_control;
      eval $backgroundColorKey $param;
    end;    
 
    define event 
    ul_control;
      do 
      / if NOT $param;
        eval $underlineState 'on';
      else
      / if $paramNum = 0;
        eval $underlineState 'off';
      else;
        eval $underlineState 'on';
      done;
    end;
 
    define event 
    b_control;
      do 
      / if NOT $param;
        eval $boldState 'on';
      else
      / if $paramNum = 0;
        eval $boldState 'off';
      else;
        eval $boldState 'on';
      done;
    end;
 
    define event 
    i_control;
      do 
      / if NOT $param;
        eval $italicState 'on';
      else
      / if $paramNum = 0;
        eval $italicState 'off';
      else;
        eval $italicState 'on';
      done;
    end;
 
    define event
    put_text;
 
      trigger body_open;
 
      trigger evaluate_state;
 
      set $output htmlencode($token);
      put $output;
 
      unset $output;
    end;
 
    define event 
    save_state;
      set $fgColors[]  $foregroundColorKey;
      set $bgColors[]  $backgroundColorKey;
      set $italic[]    $italicState;
      set $bold[]      $boldState;
      set $underline[] $underlineState;
 
      unset $dlm;
      set $dlm '';
      put '<SPAN CLASS="';
      do / if $foregroundColorKey;
        put $dlm 'cf' $foregroundColorKey;
        set $dlm ' ';
      done;
      do / if $backgroundColorKey;
        put $dlm 'cb' $backgroundColorKey;
        set $dlm ' ';
      done;
      do / if cmp('on',$italicState);
        put $dlm 'i';
        set $dlm ' ';
      done;
      do / if cmp('on',$boldState);
        put $dlm 'b';
        set $dlm ' ';
      done;
      do / if cmp('on',$underlineState);
        put $dlm 'u';
        set $dlm ' ';
      done;
      put '">';
    end;
 
    define event 
    evaluate_state; 
      do / if NOT $fgColors;
        trigger save_state;
        break;
      done;
 
      eval $prior_state_found 0;
 
      break / if NOT $fgColors;
 
      eval $index 0;
      do / while ($index < $fgColors) and not $prior_state_found;
        eval $index $index+1;
 
        eval $match1 $fgColors[$index] = $foregroundColorKey;
        eval $match2 $bgColors[$index] = $backgroundColorKey;
        eval $match3 $italic[$index]   = $italicState;
        eval $match4 $bold[$index]     = $boldState;
        eval $match5 $underline[$index]= $underlineState;
 
        eval $prior_state_found
              $match1 
          AND $match2 
          AND $match3 
          AND $match4 
          AND $match5
        ;
      done;
 
      do / if NOT $prior_state_found;
        put '</SPAN>';
        unset $fgColors[1];
        unset $bgColors[1];
        unset $italic[1];
        unset $bold[1];
        unset $underline[1];
        trigger save_state;
        break;
      done;
 
      unset $index;
    end;
 
  end; * tagset;
run;
 
/**/
ods tagsets.eeRtfToHtmlTranslator 
  file = "c:\temp\clipboard.html"
  options (rtfFile = "c:\temp\clipboard.rtf" )
;
 
ods tagsets.eeRtfToHtmlTranslator close;
/**/