Let ack(1) compile files with non-ASCII names.

This commit changes how ack(1) parses backslashes in its descr files.
Before this commit, ack set the high bit of each character escaped by
a backslash, and later cleared all high bits in command arguments, but
this lost the high bits in non-ASCII filenames.  After this commit,
ack keeps backslashes in strings while processing them.  Functions
scanvars(), scanexpr(), doassign(), unravel(), addargs() now
understand backslashes.  I remove from ack_basename() the warning
about non-ASCII characters.

This commit makes some incompatible changes for backslashes in descr
files.  None of our descr files uses backslashes, except for those
backslashes that continue lines, and there are no changes for those
backslashes.  The problem with non-ASCII filenames had its cause in a
feature that we weren't using.

With this commit, ack now understands backslashes after the = sign in
both "var NAME=value" and "mapflag -flag NAME=value".  Before, ack
never scanned backslashes in "var" lines, so "var A=\{B}" failed to
prevent expansion of B.  Now it does.  Before, ack did scan for
backslashes in the "-flag NAME=" part of "mapflag" lines.  Now it
doesn't, so it is no longer possible to map a flag that contains a
literal space, tab, or star "*".

I removed the expansion of "{{" to "{".  One can use "\{" for a
literal "{", and "\{" now works in "var" lines.  Before and now, ack
never expanded "{" in flags for "mapflag", so the correct way to map a
literal flag "-{" remains "mapflag -{ ...", not "mapflag -{{ ...".
(The other way "mapflag -\{ ..." stops working with this commit.)

Backslashes in strange places, like "{NA\ME}", probably have different
behavior now.

Backslashes in "program" lines now work.  Before, ack scanned for
backslashes there but forgot to clear the high bits later.

Escaping < or > as \< or \> now works, and prevents substitution of
the input or output file paths.  Before, ack only expanded the first <
or > in each argument.  Now, it expands every unescaped < or > in an
argument, but this is an accident of how I rewrote the code.  I don't
suggest to put more than one each of < or > in a command.  The code no
longer optimizes away its recursive calls when the argument is "<".

The code continues to set or clear the high bit NO_SCAN on the first
characters of flags.  This doesn't seem to be a problem, because flags
usually begin with an ASCII hyphen '-'.
This commit is contained in:
George Koehler 2016-11-14 23:33:44 -05:00
parent fafc8a0b8a
commit 07b04c64c8
5 changed files with 197 additions and 181 deletions

View file

@ -75,14 +75,10 @@ char *getvar(const char *);
/* util.c */
char *ack_basename(const char *);
void clr_noscan(char *);
char *skipblank(char *);
char *firstblank(char *);
void fatal(const char *, ...);
void vprint(const char *, ...);
#ifdef DEBUG
void prns(const char *);
#endif
void fuerror(const char *, ...);
void werror(const char *, ...);
void quit(int);

View file

@ -47,10 +47,6 @@ static char rcs_dmach[] = RCS_DMACH ;
#define CALL "callname"
#define END "end"
/* trans.c */
growstring scanb(const char *) ;
growstring scanvars(const char *) ;
static void intrf(void) ;
static void open_in(char *) ;
static void close_in(void) ;
@ -108,8 +104,8 @@ static int inoptlist(char *nm) {
}
static void intrf(void) {
/* Read in trf (transformation) */
register trf *new ;
growstring bline ;
int twice ;
int name_seen=0 ;
@ -130,20 +126,14 @@ static void intrf(void) {
} else
if ( strcmp(ty_name,PROG)==0 ) {
if ( new->t_prog ) twice=YES ;
bline= scanb(bol); /* Scan for \ */
new->t_prog= gr_final(&bline);
new->t_prog= keeps(bol);
} else
if ( strcmp(ty_name,MAPF)==0 ) {
/* First read the mapflags line
and scan for backslashes */
bline= scanb(bol) ;
l_add(&new->t_mapf,gr_final(&bline)) ;
l_add(&new->t_mapf,keeps(bol)) ;
} else
if ( strcmp(ty_name,ARGS)==0 ) {
if ( new->t_argd ) twice=YES ;
bline= scanb(bol) ;
new->t_argd= keeps(gr_start(bline)) ;
gr_throw(&bline) ;
new->t_argd= keeps(bol) ;
} else
if ( strcmp(ty_name,STD_IN)==0 ) {
if ( new->t_stdin ) twice=YES ;
@ -242,7 +232,7 @@ static void intrf(void) {
register list_elem *elem ;
vprint("%s: from %s to %s '%s'\n",
new->t_name,new->t_in ? new->t_in : "(null)",new->t_out,new->t_prog) ;
vprint("\targs: ") ; prns(new->t_argd) ;
vprint("\targs: %s",new->t_argd) ;
scanlist( l_first(new->t_mapf), elem ) {
vprint("\t%s\n",l_content(*elem)) ;
}

View file

@ -20,9 +20,6 @@ static char rcs_id[] = "$Id$" ;
#define ARG_MORE 40 /* The size of args chunks to allocate */
/* trans.c */
growstring scanvars(const char *);
static int run_exec(trf *, const char *);
static void x_arg(char *);
@ -33,10 +30,8 @@ static unsigned argmax; /* The maximum number of arguments so far */
int runphase(trf *phase) {
register list_elem *elem ;
char *prog ; int result ;
growstring bline ;
bline=scanvars(phase->t_prog) ;
prog=gr_final(&bline) ;
prog=phase->t_prog ;
if ( v_flag || debug ) {
if ( v_flag==1 && !debug ) {
vprint("%s",phase->t_name) ;
@ -71,7 +66,6 @@ int runphase(trf *phase) {
}
x_arg( (char *)0 ) ;
result=run_exec(phase,prog) ;
throws(prog) ;
return result ;
}

View file

@ -33,6 +33,8 @@ static void condit(growstring *, list_head *, list_head *, char *);
static int mapflag(list_head *, const char *);
static int mapexpand(char *, const char *);
static void getcallargs(trf *);
static growstring without_bslash(const char *);
static void getprogram(trf *);
int transform(trf *phase) {
int ok ;
@ -42,6 +44,7 @@ int transform(trf *phase) {
return 0 ;
}
getcallargs(phase) ;
getprogram(phase) ;
ok= runphase(phase) ;
if ( !ok ) rmfile(&out) ;
/* Free the space occupied by the arguments,
@ -78,8 +81,9 @@ void getmapflags(trf *phase) {
scanlist(l_first(phase->t_inputs),elem) {
l_in = p_cont(*elem) ;
if ( mapflag(&(phase->t_mapf),l_in->p_path) ) {
ptr= keeps(getvar(LIBVAR)) ;
clr_noscan(ptr) ;
growstring temp;
temp= without_bslash(getvar(LIBVAR)) ;
ptr= gr_final(&temp);
#ifdef DEBUG
if ( debug >=4 ) {
vprint("phase %s, library %s(%s)\n",
@ -204,71 +208,47 @@ static void set_Rflag(char *argp) {
/* */
/**************************************************************************/
growstring scanb(const char *line) {
/* Scan a line for backslashes, setting the NO_SCAN bit in characters
preceded by a backslash.
*/
const char *in_c ;
int token ;
growstring result ;
enum { TEXT, ESCAPED } state = TEXT ;
gr_init(&result) ;
for ( in_c= line ; *in_c ; in_c++ ) {
token= *in_c&0377 ;
switch( state ) {
case TEXT :
if ( token==BSLASH ) {
state= ESCAPED ;
} else {
gr_add(&result,token) ;
}
break ;
case ESCAPED :
gr_add(&result,token|NO_SCAN) ;
state=TEXT ;
break ;
}
}
gr_add(&result,0) ;
if ( state!=TEXT ) werror("flag line ends with %c",BSLASH) ;
return result ;
}
growstring scanvars(const char *line) {
static growstring scanvars(const char *line) {
/* Scan a line variable replacements started by S_VAR.
Two sequences exist: S_VAR name E_VAR, S_VAR name A_VAR text E_VAR.
Two sequences exist: S_VAR name C_VAR, S_VAR name A_VAR text C_VAR.
neither name nor text may contain further replacements.
In the first form an error message is issued if the name is not
present in the variables, the second form produces text
in that case.
The sequence S_VAR S_VAR is transformed into S_VAR.
This to allow later recognition in mapflags, where B_SLASH
would be preventing any recognition.
*/
const char *in_c ;
int token ;
int token, token_r ;
growstring result, name ;
char *tr ;
enum { TEXT, FIRST, NAME, SKIP, COPY } state = TEXT ;
int escaped = NO;
gr_init(&result) ; gr_init(&name) ;
for ( in_c= line ; *in_c ; in_c++ ) {
token= *in_c&0377 ;
token_r= (escaped ? 0 : token);
/* A backslash escapes the next character. */
if ( token_r==BSLASH ) {
if ( state==TEXT || state==COPY ) {
/* Keep BSLASH for later scans. */
gr_add(&result,token) ;
}
escaped= YES;
continue;
}
escaped= NO;
switch( state ) {
case TEXT :
if ( token==S_VAR ) {
if ( token_r==S_VAR ) {
state= FIRST ;
} else {
gr_add(&result,token) ;
}
break ;
case FIRST :
switch ( token ) {
case S_VAR :
state= TEXT ;
gr_add(&result,token) ;
break ;
switch ( token_r ) {
case A_VAR :
case C_VAR :
fatal("empty string variable name") ;
@ -279,7 +259,7 @@ growstring scanvars(const char *line) {
}
break ;
case NAME:
switch ( token ) {
switch ( token_r ) {
case A_VAR :
gr_add(&name,0) ;
if ( tr=getvar(gr_start(name)) ) {
@ -311,16 +291,17 @@ growstring scanvars(const char *line) {
}
break ;
case SKIP :
if ( token==C_VAR ) state= TEXT ;
if ( token_r==C_VAR ) state= TEXT ;
break ;
case COPY :
if ( token==C_VAR ) state= TEXT ; else {
if ( token_r==C_VAR ) state= TEXT ; else {
gr_add(&result,token) ;
}
break ;
}
}
gr_add(&result,0) ;
if ( escaped ) werror("flag line ends with %c",BSLASH) ;
if ( state!=TEXT ) {
werror("flag line misses %c",C_VAR) ;
gr_throw(&name) ;
@ -332,68 +313,80 @@ static growstring scanexpr(const char *line) {
/* Scan a line for conditional or flag expressions,
dependent on the type. The format is
S_EXPR suflist M_EXPR suflist T_EXPR tail C_EXPR
the head and tail are passed to treat, together with the
the head and tail are passed to condit(), together with the
growstring for futher treatment.
Nesting is not allowed.
*/
const char *in_c, *heads ;
int token ;
int token, token_r ;
growstring sufs, tailval, result ;
static list_head fsuff, lsuff ;
enum { TEXT, FDOT, FSUF, LDOT, LSUF, FTAIL } state = TEXT ;
int escaped = NO;
gr_init(&result) ; gr_init(&sufs) ; gr_init(&tailval) ;
for ( in_c= line ; *in_c ; in_c++ ) {
token= *in_c&0377 ;
token_r= (escaped ? 0 : token);
/* A backslash escapes the next character. */
if ( token_r==BSLASH ) {
if ( state==TEXT || state==FTAIL ) {
/* Keep BSLASH for later scans. */
gr_add(&result,token) ;
}
escaped= YES;
continue;
}
escaped= NO;
switch( state ) {
case TEXT :
if ( token==S_EXPR ) {
if ( token_r==S_EXPR ) {
state= FDOT ;
heads=in_c ;
} else gr_add(&result,token) ;
break ;
case FDOT :
if ( token==M_EXPR ) {
if ( token_r==M_EXPR ) {
state=LDOT ;
break ;
}
token &= ~NO_SCAN ;
if ( token!=SUFCHAR ) {
error("Missing %c in expression",SUFCHAR) ;
}
gr_add(&sufs,token) ; state=FSUF ;
break ;
case FSUF :
if ( token==M_EXPR || (token&~NO_SCAN)==SUFCHAR) {
if ( token_r==M_EXPR || token==SUFCHAR ) {
gr_add(&sufs,0) ;
l_add(&fsuff,gr_final(&sufs)) ;
}
if ( token==M_EXPR ) {
if ( token_r==M_EXPR ) {
state=LDOT ;
} else gr_add(&sufs,token&~NO_SCAN) ;
} else gr_add(&sufs,token) ;
break ;
case LDOT :
if ( token==T_EXPR ) {
if ( token_r==T_EXPR ) {
state=FTAIL ;
break ;
}
token &= ~NO_SCAN ;
if ( token!=SUFCHAR ) {
error("Missing %c in expression",SUFCHAR) ;
}
gr_add(&sufs,token) ; state=LSUF ;
break ;
case LSUF :
if ( token==T_EXPR || (token&~NO_SCAN)==SUFCHAR) {
if ( token_r==T_EXPR || token==SUFCHAR) {
gr_add(&sufs,0) ;
l_add(&lsuff,gr_final(&sufs)) ;
}
if ( token==T_EXPR ) {
if ( token_r==T_EXPR ) {
state=FTAIL ;
} else gr_add(&sufs,token&~NO_SCAN) ;
} else gr_add(&sufs,token) ;
break ;
case FTAIL :
if ( token==C_EXPR ) {
if ( token_r==C_EXPR ) {
/* Found one !! */
gr_add(&tailval,0) ;
condit(&result,&fsuff,&lsuff,gr_start(tailval)) ;
@ -503,6 +496,7 @@ static int mapexpand(char *mapentry, const char *cflag) {
void doassign(const char *line, const char *star, int length) {
growstring varval, name, temp ;
const char *ptr ;
int escaped = NO ;
gr_init(&varval) ;
gr_init(&name) ;
@ -517,12 +511,21 @@ void doassign(const char *line, const char *star, int length) {
}
temp= scanvars(ptr+1) ;
for ( ptr=gr_start(temp); *ptr; ptr++ ) switch ( *ptr ) {
case BSLASH :
escaped= YES ;
gr_add(&varval,*ptr) ;
break ;
case STAR :
if ( star ) {
while ( length-- ) gr_add(&varval,*star++|NO_SCAN) ;
if ( star && !escaped ) {
while ( length-- ) {
gr_add(&varval,BSLASH) ;
gr_add(&varval,*star++) ;
}
break ;
}
/* FALLTHROUGH */
default :
escaped= NO ;
gr_add(&varval,*ptr) ;
break ;
}
@ -533,15 +536,16 @@ void doassign(const char *line, const char *star, int length) {
#define ISBLANK(c) ( (c)==SPACE || (c)==TAB )
static void unravel(char *line, void (*action)(char *)) {
static void unravel(const char *line, void (*action)(char *)) {
/* Unravel the line, get arguments a la shell */
/* each argument is handled to action */
/* The input string is left intact */
register char *in_c ;
register int token ;
enum { BLANK, ARG } state = BLANK ;
const char *in_c ;
int token ;
enum { BLANK, ARG, ESCAPED } state = BLANK ;
growstring argum ;
/* Loop for each character of line, including final '\0' */
in_c=line ;
for (;;) {
token= *in_c&0377 ;
@ -549,9 +553,13 @@ static void unravel(char *line, void (*action)(char *)) {
case BLANK :
if ( token==0 ) break ;
if ( !ISBLANK(token) ) {
state= ARG ;
gr_init(&argum) ;
gr_add(&argum,token&~NO_SCAN) ;
gr_add(&argum,token) ;
if ( token == BSLASH ) {
state= ESCAPED ;
} else {
state= ARG ;
}
}
break ;
case ARG :
@ -561,84 +569,94 @@ static void unravel(char *line, void (*action)(char *)) {
gr_throw(&argum) ;
state=BLANK ;
} else {
gr_add(&argum,token&~NO_SCAN) ;
gr_add(&argum,token) ;
if ( token == BSLASH ) state= ESCAPED ;
}
break ;
case ESCAPED :
gr_add(&argum,token) ;
state= ARG ;
break ;
}
if ( token == 0 ) break ;
in_c++ ;
}
}
static char *c_rep(char *string, char *place, char *rep) {
/* Produce a string in stable storage produced from 'string'
with the character at place replaced by rep
*/
growstring name ;
register char *nc ;
register char *xc ;
gr_init(&name) ;
for ( nc=string ; *nc && nc<place ; nc++ ) {
gr_add(&name,*nc) ;
}
#ifdef DEBUG
if ( *nc==0 ) fatal("Place is not in string") ;
#endif
for ( xc=rep ; *xc ; xc++ ) gr_add(&name,*xc|NO_SCAN) ;
gr_add(&name,0) ;
gr_cat(&name,nc+1) ;
return gr_final(&name) ;
}
static list_head *curargs ;
static list_head *comb_args ;
static void addargs(char *string) {
register char *temp, *repc ;
register list_elem *elem ;
static void addargs3(const char *prefix1, const char *prefix2,
const char *string)
{
/* Prepend prefix1 and prefix2 to string, then add it to
curargs. string is scanned to strip backslashes and
substitute files for C_IN and C_OUT. prefixes are not
scanned.
*/
const char *in_c ;
int token ;
char *tr ;
list_elem *elem ;
growstring argum ;
int escaped = NO ;
repc=strchr(string,C_IN) ;
if ( repc ) {
/* INPUT FILE TOKEN seen, replace it and scan further */
if ( repc==string && string[1]==0 ) {
if ( in.p_path ) { /* All but combiner */
l_add(curargs,keeps(in.p_path)) ;
} else {
scanlist( l_first(*comb_args), elem ) {
l_add(curargs,
keeps(p_cont(*elem)->p_path)) ;
gr_init(&argum);
for ( in_c= prefix1 ; *in_c ; in_c++ ) {
gr_add(&argum,*in_c) ;
}
for ( in_c= prefix2 ; *in_c ; in_c++ ) {
gr_add(&argum,*in_c) ;
}
return ;
for ( in_c= string ; *in_c ; in_c++ ) {
token= *in_c&0377 ;
if ( escaped ) {
/* Strip BSLASH, keep escaped character. */
gr_add(&argum,token) ;
escaped= NO ;
continue;
}
switch ( token ) {
case BSLASH:
escaped= YES ;
break;
case C_IN: /* Input file */
if ( in.p_path ) { /* Not for the combiners */
temp=c_rep(string,repc,in.p_path) ;
addargs(temp) ;
throws(temp) ;
for ( tr= in.p_path ; *tr; tr++ ) {
gr_add(&argum,*tr);
}
} else { /* For the combiners */
gr_add(&argum,0);
tr= gr_final(&argum);
in_c++;
scanlist( l_first(*comb_args), elem ) {
temp=c_rep(string,repc,p_cont(*elem)->p_path);
addargs(temp) ;
throws(temp) ;
}
char *p = p_cont(*elem)->p_path ;
addargs3(tr,p,in_c) ;
}
throws(tr);
return;
}
repc=strchr(string,C_OUT) ;
if ( repc ) {
/* replace the outfile token as with the infile token */
break;
case C_OUT: /* Output file */
#ifdef DEBUG
if ( !out.p_path ) fatal("missing output filename") ;
#endif
temp=c_rep(string,repc,out.p_path) ;
addargs(temp) ;
throws(temp) ;
return ;
for ( tr= out.p_path ; *tr ; tr++ ) {
gr_add(&argum,*tr) ;
}
temp= keeps(string) ;
clr_noscan(temp) ;
l_add(curargs,temp) ;
break;
default:
gr_add(&argum,token) ;
break;
}
}
gr_add(&argum,0) ;
tr= gr_final(&argum) ;
l_add(curargs,tr) ;
}
static void addargs(char *string) {
addargs3("", "", string) ;
}
static void getcallargs(trf *phase) {
@ -646,11 +664,11 @@ static void getcallargs(trf *phase) {
arg1= scanvars(phase->t_argd) ;
#ifdef DEBUG
if ( debug>=3 ) { vprint("\tvars: ") ; prns(gr_start(arg1)) ; }
if ( debug>=3 ) vprint("\tvars: %s", gr_start(arg1)) ;
#endif
arg2= scanexpr(gr_start(arg1)) ;
#ifdef DEBUG
if ( debug>=3 ) { vprint("\texpr: ") ; prns(gr_start(arg2)) ; }
if ( debug>=3 ) vprint("\texpr: %s", gr_start(arg2)) ;
#endif
gr_throw(&arg1) ;
curargs= &phase->t_args ;
@ -658,3 +676,40 @@ static void getcallargs(trf *phase) {
unravel( gr_start(arg2), addargs ) ;
gr_throw(&arg2) ;
}
static growstring without_bslash(const char *string) {
/* Strip backslashes from a copy of the string. */
growstring result;
const char *in_c ;
int token ;
int escaped = NO ;
gr_init(&result) ;
for ( in_c= string ; *in_c ; in_c++ ) {
token= *in_c&0377 ;
if ( token==BSLASH && !escaped ) {
escaped= YES ;
} else {
gr_add(&result,token);
escaped= NO ;
}
}
gr_add(&result,0);
return result;
}
static void getprogram(trf *phase) {
growstring prog1, prog2 ;
const char *in_c ;
int token ;
int escaped = NO ;
/* Expand string variables in program name. */
prog1= scanvars(phase->t_prog) ;
throws(phase->t_prog) ;
/* Strip backslashes. */
prog2= without_bslash(gr_start(prog1));
gr_throw(&prog1);
phase->t_prog= gr_final(&prog2);
}

View file

@ -46,9 +46,6 @@ char *ack_basename(const char *string) {
case '/' : last_start=fetch+1 ; break ;
case 0 : goto out ;
}
if ( !isascii(ctoken) || !isprint(ctoken) ) {
werror("non-ascii characters in argument %s",string) ;
}
}
out:
if ( ! *last_start ) fuerror("empty filename \"%s\"",string) ;
@ -61,13 +58,6 @@ out:
return retval ;
}
void clr_noscan(char *str) {
register char *ptr ;
for ( ptr=str ; *ptr ; ptr++ ) {
*ptr&= ~NO_SCAN ;
}
}
char *skipblank(char *str) {
register char *ptr ;
@ -105,15 +95,6 @@ void vprint(const char* fmt, ...)
va_end(ap);
}
#ifdef DEBUG
void prns(const char *s) {
for ( ; *s ; s++ ) {
putc((*s&0377)&~NO_SCAN,STDOUT) ;
}
putc('\n',STDOUT) ;
}
#endif
/* VARARGS1 */
void fuerror(const char *fmt, ...) {
/* Fatal user error */