As the first step in the decommissioning of sasCommunity.org the site has been converted to read-only mode.


Here are some tips for How to share your SAS knowledge with your professional network.


Difference between revisions of "Appendices: RUN MACRO Run! SGF Paper 033-2013"

From sasCommunity
Jump to: navigation, search
(add wikilinks)
 
Line 3: Line 3:
 
== Here are the routines as mentioned at the end of the paper.  ==
 
== Here are the routines as mentioned at the end of the paper.  ==
  
This function allows SAS to guess at a variable's type, rather than have to manually investigate a data set.  The function is meant to be called on the sashelp view of the dictionary table.
+
This function allows [[SAS]] to guess at a variable's type, rather than have to manually investigate a data set.  The [[function]] is meant to be called on the sashelp view of the dictionary table.
  
 
<source lang="sas">  
 
<source lang="sas">  

Latest revision as of 12:21, 19 September 2017

For the page about the paper, see RUN MACRO Run! SGF Paper 033-2013.

Here are the routines as mentioned at the end of the paper.

This function allows SAS to guess at a variable's type, rather than have to manually investigate a data set. The function is meant to be called on the sashelp view of the dictionary table.

 
data enhanced_vcolumn;       
 set sashelp.vcolumn;
where libname="SASHELP" and memname="CLASS"; *select data sets and variables of interest;
 
length dsn $ 200 varcat $30; 
dsn=catt(libname,'.',memname); *form compound data set name;
 
 
varcat = Categorize_Variable(trim(dsn), name, type, length, format, informat);
 
*varcat will be the best guess as to the variables 'type':
 possible values - 
          'Empty' 
          'Indicator' 
          'Date'
          'Categorical'
          'Likely ID Variable'
          'Numeric Measure'
          'Character String'
;
 
keep libname memname name type length format informat varcat;
run;


The function starts by calling a proc freq on the variable of interest. Then it calls a series of sub-macros to perform tests on the freq output and return metrics to the function, where they are ultimately combined using crude heuristics into a guess about the variable type.

 
/*The Wrapper Function*/
 
proc fcmp outlib=work.functions.wrapper;
 
  function Categorize_Variable(dsn $, variable $, type $, length, format $, informat $) $;
 
		   *get number of observations in original data set;
           rc = run_macro('get_nobs', dsn, nobs);
 
		   *produce frequencies data set for desired variable;
		   freqdata='_FCMP_freq'; *name of freq data set;
		   rc = run_macro('run_freq', dsn, variable, freqdata);
 
		   *get number of missing values for variable;
		   rc = run_macro('get_nmiss', freqdata, nmiss); *also drop missings from freqdata;
 
		   *get number of unique values for variable;
           rc = run_macro('get_nlevels', freqdata, nlevels); *nlevels=nobs(freq);
 
		   *check if variable appears to be an 'indicator' variable;
           if nlevels in (1,2) then rc = run_macro('check_indicator', freqdata, indicator);
 
		   *check if variable contains digits, spaces, or a non-numeric character;
		   if type='char' then str_length=length; else str_length=101; *stores numbers up to 1 Googol;
           rc = run_macro('check_symbols', freqdata, str_length, anyalpha, anydigit, anyspace, alldigit);
 
		   *check if variable is a string representing numeric values;
           if type='char' and anyalpha=0 then stringnum = 1; 
                                         else stringnum = 0;
 
           *summarize value lengths for variable;
           rc = run_macro('get_val_lengths', freqdata, str_length, minlength, medlength, maxlength);
		   lengthdif = maxlength - minlength;
 
           *check if variable represents a date;
		   date = 0;
           if index(informat,'YY') or index(informat,'DATE')
		   or index(  format,'YY') or index(  format,'DATE') then date = 1; *check if format or informat is date;
           if date=0 and (type='char' or alldigit) then rc = run_macro('check_date', freqdata, type, format, maxlength, date);
 
           *check if variable values appear a consistent number of times;
           rc=run_macro('check_commonfreq',freqdata,commonfreq);
 
	  *cleanup work library;
           lib='WORK'; 
           list='_FCMP_freq'; 
           rc=run_macro('proc_datasets_delete', lib, list);
 
		   *heuristics to determine likelihood of variable being an ID; 
			if nlevels > .90*nobs    then ID_score + 3; *1-to-1 with obs;
			if commonfreq > .40      then ID_score + 2; *common frequency;
			if missing < .05*nobs    then ID_score + 1; *few missing values;
			if anyalpha and anydigit then ID_score + 2; *values are alphanumeric;
			if (alldigit or anyalpha) and lengthdif < 2 then ID_score + 1; *values are of consistent length;
			if (alldigit or anyalpha) and anyspace  = 0 then ID_score + 1; *values represent single 'word';	
 
           length varcat $30;
 
		   *implement decision rules to return a best guess for variable type;
           if nmiss=nobs then varcat='Empty'; else
           if indicator then varcat='Indicator'; else
		   if date then varcat='Date'; else
		   if (type='char' and anyalpha=1  and nlevels < max(15,.1*(nobs)))
           or ((type='num' or  anyalpha=0) and ((nlevels < 15) or (commonfreq < .25))) 
           then varcat='Categorical'; else
	       if ID_score > 3 then varcat='Likely ID Variable'; else
	       if type='num'   then varcat='Numeric Measure'; else 
	       if type='char'  then varcat='Character String';
 
		   return(trim(varcat));
  endsub;
 
quit;
 
options cmplib=work.functions;


And here are the various submacros that are called by the function:

 
%macro get_nobs();
*return nobs from original dataset;
 
  %let dsn =%sysfunc(dequote(&dsn)); 
  %let nobs=0;
 
	  Data _null_;
	  call symput ('nobs',totobs);
	  if 0 then set &dsn nobs=totobs;
	  run;
 
  %let nobs=%sysfunc(trim(%sysfunc(left(&nobs))));
%mend get_nobs;
 
%macro run_freq();
*run frequency of variable of interest;
 
	%let dsn     =%sysfunc(dequote(&dsn)); 
	%let variable=%sysfunc(dequote(&variable));
	%let freqdata=%sysfunc(dequote(&freqdata));
 
		proc freq 
		data=&dsn (keep=&variable);
		tables &variable /out= &freqdata (keep=&variable count rename=(&variable=variable)) missing;
		run;
%mend run_freq;
 
 
%macro get_nmiss();
*return number of missing values;
*also drop missings from frequency table;
 
	%let freqdata=%sysfunc(dequote(&freqdata));
	%let nmiss=0;
 
		Data &freqdata;
		 set &freqdata; *might not catch special missings;
		if missing(variable) 
		   then call symput ('nmiss',count);
		   else output; *drop missings from freq;
		run;
 
	%let nmiss=%sysfunc(trim(%sysfunc(left(&nmiss))));
	%if %length(&nmiss)=0 %then %let nmiss=0;
%mend get_nmiss;
 
 
%macro get_nlevels();
*return number of unique non-missing values;
 
	%let freqdata=%sysfunc(dequote(&freqdata));
	%let nlevels =0;
 
		Data _null_;
		call symput ('nlevels',nlevels);
		if 0 then set &freqdata nobs=nlevels;
		run;
 
	%let nlevels=%sysfunc(trim(%sysfunc(left(&nlevels))));
%mend get_nlevels;
 
 
%macro check_indicator();
*return indicator for whether variable is an indicator;
 
	%let freqdata =%sysfunc(dequote(&freqdata));
	%let indicator=0;
 
		Data _null_; 
		 set &freqdata. end=eof;
 
		*crude transpose for up to 2 obs;
		length var1 var2 $32;
		retain var1 var2;
 
		if _N_ =1 then var1=variable;
		var2=variable; *if nlevels=1;
		if _N_ =2 then var2=variable;
 
		indicator=0;
 
		if eof then do;
 
		*declare indicator if values are between -2 and 2, or are y/n;
	         if notdigit(var1)=0 and notdigit(var2)=0 
	         and abs(var1)<3 and abs(var2)<3  then indicator=1; else
	         if  upcase(strip(var1)) in: ('Y','N') 
	         and upcase(strip(var2)) in: ('Y','N') then indicator=1;
 
		call symput('indicator',indicator); 
 
		end;
 
		Run;
 
	%let indicator=%sysfunc(trim(%sysfunc(left(&indicator))));
%mend check_indicator;
 
 
%macro check_symbols();
*return indicators if values are numeric, alphanumeric, single-word, etc.;
 
	%let str_length=%sysfunc(dequote(&str_length));
	%let freqdata  =%sysfunc(dequote(&freqdata));
 
		Data &freqdata (drop=ANYA ANYD ANYS ALLD ANYALPHA ANYDIGIT ANYSPACE ALLDIGIT string digits);
		 set &freqdata end = eof;
 
		RETAIN ANYA ANYD ANYS ALLD 0; *variable-level indicators;
 
		*see if is numeric stored in character;
		length string $ &str_length.;
		string=trim(left(substr(variable,1)));         *string form;
		digits=input(compress(string,',$%'),?? BEST.); *numeric form;
 
		*check each of the unique values in turn;
		ANYALPHA=missing(digits);
		ANYDIGIT=(anydigit(string)>0);
		ANYSPACE=(countw(string,' ')>1);
		ALLDIGIT=not (missing(digits) or notdigit(trim(left(digits))));
		*ALLDIGIT=0 means char, or num that contains . - or E;
 
		*update variable-level indicators;
		ANYA = max(ANYA, ANYALPHA);
		ANYD = max(ANYD, ANYDIGIT);
		ANYS = max(ANYS, ANYSPACE);
		if _N_ = 1 then ALLD = ALLDIGIT;
		           else ALLD = min(ALLD, ALLDIGIT);
 
		if eof then do;
			call symput('ANYALPHA',trim(left(ANYA)));
			call symput('ANYDIGIT',trim(left(ANYD)));
			call symput('ANYSPACE',trim(left(ANYS)));
			call symput('ALLDIGIT',trim(left(ALLD)));
		end;
		Run;
%mend check_symbols;
 
 
%macro get_val_lengths();
*return range of value lengths;
 
	%let freqdata  =%sysfunc(dequote(&freqdata));
	%let str_length=%sysfunc(dequote(&str_length));
 
		Data _FCMP_freqlengths;
		 set &freqdata;
		length string $ &str_length.;
		string=trim(left(substr(variable,1)));  *string form;
		val_length=lengthN(string);*get length of each VALUE;
		run;
 
		%let maxlength=0;
		%let medlength=0;
		%let minlength=0;
 
		Proc Means
		Data=_FCMP_freqlengths;
		Var val_length;
		Freq Count;
		Output Out=_FCMP_lengths MIN=min MEDIAN=med MAX=max;
		Run;
 
		data _null_;
		 set _FCMP_lengths;
		call symput('maxlength',trim(left(max)));
		call symput('medlength',trim(left(med)));
		call symput('minlength',trim(left(min)));
		Run;
 
	proc datasets lib=work nolist; 
	delete _FCMP_freqlengths _FCMP_lengths; 
	quit;
%mend get_val_lengths;
 
 
%macro check_date();
*return indicator for whether variable is a date;
 
	%let freqdata  =%sysfunc(dequote(&freqdata));
	%let maxlength =%sysfunc(dequote(&maxlength));
	%let format    =%sysfunc(dequote(&format));
	%let DATE=0;
 
		data _null_;
		*generate in case date is in format mddyyyy, possibly missing leading zero;
		call symput('zmaxlength',catt('Z',left(&maxlength),'.'));
		call symput('zmax_plus1',catt('Z',left(&maxlength+1),'.'));
		run;
 
		Data _FCMP_freqdates;
		 set &freqdata end = eof;
 
		RETAIN DATEC DATEF DATEZ DATEZ1 0; *variable-level indicators;
 
		%if %length(&format.)>0 %then %do;
		*put formatted value in case is SAS date;
		ANYDATEF=input(put(variable,&format.), ?? ANYDTDTE.);
		%end;
 
		%if %index(&type,char) %then %do;
		*try to read character string as date;
		ANYDATEC=input(variable, ?? ANYDTDTE.);
		%end;
		%else %if %index(&type,num) and %eval(&maxlength < 31) %then %do;
			ANYDATEZ=input(put(variable,&zmaxlength.), ?? ANYDTDTE.);
			ANYDATEZ1=input(put(variable,&zmax_plus1.), ?? ANYDTDTE.);
		%end;
 
		DATEF  + missing(ANYDATEF);
		DATEC  + missing(ANYDATEC);
		DATEZ  + missing(ANYDATEZ);
		DATEZ1 + missing(ANYDATEZ1);
 
		if eof then do; 
		*if no missing, then one was successful in reading as date;
		call symput('DATE', min(DATEC, DATEF, DATEZ, DATEZ1)=0); 
		end;
 
		run;
 
	%let DATE=%sysfunc(trim(%sysfunc(left(&DATE))));
 
	proc datasets lib=work nolist; 
	delete _FCMP_freqdates; 
	quit;
%mend check_date;
 
 
%macro check_commonfreq();
*return percentage of values with a common frequency;
 
	%let freqdata   =%sysfunc(dequote(&freqdata));
	%let CommonFreq=0;
 
		*run freqs of freqs;
		proc freq 
		data=&freqdata (keep=Count rename=(Count=Freq));
		tables freq /out=_FCMP_freq2 missing;
		run;
 
		Proc SQL noprint;
		Select max(round(Percent,.1)) into :CommonFreq
		From _FCMP_freq2
		;
		Quit;
 
	%let CommonFreq=%sysfunc(trim(%sysfunc(left(&CommonFreq))));
 
	proc datasets lib=work nolist; delete _FCMP_freq2; quit;
%mend check_commonfreq;
 
 
%macro proc_datasets_delete();
 
	%let lib =%sysfunc(dequote(&lib));
	%let list=%sysfunc(dequote(&list));
 
		proc datasets lib=&lib nolist; delete &list; quit;
 
%mend proc_datasets_delete;



Call Routine Example

This call routine returns numerous values which give information as to whether two variables are the same. To call it you have to have a data set of paired data set names and variable names. In the paper I describe how you can join the dictionary table to itself to see if the 'Likely ID Variables' described above are the same across files.

 
proc fcmp outlib=work.functions.wrapper;
 
  subroutine Merge_Metrics(Dataset1 $, Var1 $, Dataset2 $, Var2 $,
             NOBS1, missing1, nonmiss1, numobsmerged1, pctobsmerged1, nlevels1, numvalmerged1, pctvalmerged1,
             NOBS2, missing2, nonmiss2, numobsmerged2, pctobsmerged2, nlevels2, numvalmerged2, pctvalmerged2,
		     TotPct_valmerged, TotPct_obsmerged, Similarity_Index);
 
     outargs NOBS1, missing1, nonmiss1, numobsmerged1, pctobsmerged1, nlevels1, numvalmerged1, pctvalmerged1,
             NOBS2, missing2, nonmiss2, numobsmerged2, pctobsmerged2, nlevels2, numvalmerged2, pctvalmerged2,
		     TotPct_valmerged, TotPct_obsmerged, Similarity_Index;
 
     rc = run_macro('Merge_Metrics', Dataset1, Var1, Dataset2, Var2,
             NOBS1, missing1, nonmiss1, numobsmerged1, pctobsmerged1, nlevels1, numvalmerged1, pctvalmerged1,
             NOBS2, missing2, nonmiss2, numobsmerged2, pctobsmerged2, nlevels2, numvalmerged2, pctvalmerged2,
			 TotPct_valmerged, TotPct_obsmerged, Similarity_Index);
  endsub;
 
quit;
 
options cmplib=work.functions;


 
%macro Merge_Metrics();
 
%let dataset1=%SYSFUNC(dequote(&dataset1));
%let var1=%SYSFUNC(dequote(&var1));
 
%let dataset2=%SYSFUNC(dequote(&dataset2));
%let var2=%SYSFUNC(dequote(&var2));
 
ods noresults;
ods listing close;
 
proc freq 
data=&dataset1 (keep=&var1.);
tables &var1. /out=_FCMP_freq1(keep=&var1. count) missing;
run;
 
proc freq 
data=&dataset2 (keep=&var2.);
tables &var2. /out=_FCMP_freq2(keep=&var2. count) missing;
run;
 
Proc SQL;
Create Table _FCMP_Merge As
Select d1.&var1. as var1,
       d1.count  as N1,
       d2.&var2. as var2,
	   d2.count  as N2,
	   not missing(d1.count) as in_d1,
	   not missing(d2.count) as in_d2,
	   (not missing(d1.count)) and (not missing(d2.count)) as merged
From _FCMP_freq1 as d1 FULL JOIN _FCMP_freq2 as d2
  On d1.&var1.=d2.&var2.;
Quit;
 
Proc Sort Data = _FCMP_Merge;
       By merged in_d1 in_d2; 
Run;
 
Data _NULL_;
call symput('nobs1',totobs);
if 0 then set &dataset1 (drop=_ALL_) nobs=totobs;
Run;
 
Data _NULL_;
call symput('nobs2',totobs);
if 0 then set &dataset2 (drop=_ALL_) nobs=totobs;
Run;
 
Data _NULL_;
call symput('nlevels1',nlevels);
if 0 then set _FCMP_freq1 (drop=_ALL_) nobs=nlevels;
Run;
 
Data _NULL_;
call symput('nlevels2',nlevels);
if 0 then set _FCMP_freq2 (drop=_ALL_) nobs=nlevels;
Run;
 
Data _NULL_;
Retain NOBS1 missing1 nonmiss1 numobsmerged1 pctobsmerged1 nlevels1 numvalmerged1 pctvalmerged1
       NOBS2 missing2 nonmiss2 numobsmerged2 pctobsmerged2 nlevels2 numvalmerged2 pctvalmerged2
       0;
 Set _FCMP_Merge end=eof;
 
if missing(var1) and missing(var2) then do;
missing1 = ifn(missing(N1),0,N1);
missing2 = ifn(missing(N2),0,N2);
end;
 
if merged then do;
numvalmerged1 + 1;
numobsmerged1 + N1;
 
numvalmerged2 + 1;
numobsmerged2 + N2;
end;
 
if eof then do;
 
*produce output statistics;
 
    NOBS1 = &nobs1.;
    NOBS2 = &nobs2.;
 
    nonmiss1= sum(NOBS1, -1*missing1);
    nonmiss2= sum(NOBS2, -1*missing2);
 
    Nlevels1 = &nlevels1. - (missing1>0);
    Nlevels2 = &nlevels2. - (missing2>0);
 
    pctvalmerged1 = numvalmerged1/Nlevels1;
    pctvalmerged2 = numvalmerged2/Nlevels2;
 
    if nonmiss1 > 0 then do;
    pctobsmerged1 = numobsmerged1/nonmiss1;
    end;
 
    if nonmiss2 > 0 then do;
    pctobsmerged2 = numobsmerged2/nonmiss2;
    end;
 
    if sum(nonmiss1, nonmiss2) > 0 then do;
    TotPct_valmerged = (numvalmerged1 + numvalmerged2)/(Nlevels1 + Nlevels2); 
    TotPct_obsmerged = (numobsmerged1 + numobsmerged2)/(nonmiss1 + nonmiss2);
    Similarity_Index = TotPct_valmerged*TotPct_obsmerged;
    end;
 
call symput('numvalmerged1', numvalmerged1);
call symput('numvalmerged2', numvalmerged2);
call symput('pctvalmerged1', pctvalmerged1);
call symput('pctvalmerged2', pctvalmerged2);
 
call symput('numobsmerged1', numobsmerged1);
call symput('numobsmerged2', numobsmerged2);
call symput('pctobsmerged1', pctobsmerged1);
call symput('pctobsmerged2', pctobsmerged2);
 
*call symput('nobs1', nobs1);
*call symput('nobs2', nobs2);
 
call symput('missing1', missing1);
call symput('missing2', missing2);
 
call symput('nonmiss1', nonmiss1);
call symput('nonmiss2', nonmiss2);
 
call symput('nlevels1', nlevels1);
call symput('nlevels2', nlevels2);
 
call symput('TotPct_valmerged', TotPct_valmerged);
call symput('TotPct_obsmerged', TotPct_obsmerged);
call symput('Similarity_Index', Similarity_Index);
 
end; *end eof;
 
Run;
 
Proc Datasets lib=WORK nolist; delete _FCMP_:; Quit;
 
%mend  Merge_Metrics;