Sun Mar 27 14:39:33 2016 UTC ()
PR bin/50993 - this is a significant rewrite of the way that here
documents are processed.  Now, when first detected, they are
simply read (the only change made to the text is to join lines
ended with a \ to the subsequent line, otherwise end marker detection
does not work correctly - for here docs with an unquoted endmarker
only, of course).  This patch also moves the "internal subroutine"
for looking for the end marker out of readtoken1() (which had to
happen as readtoken1 is no longer reading the here doc when it is
needed) - that uses code mostly taken from FreeBSD's sh (thanks!)
and along the way results in the removal of some restrictions on
what the end marker can be.   We still do not allow all we should.
(from kre@)


(christos)
diff -r1.98 -r1.99 src/bin/sh/expand.c
diff -r1.110 -r1.111 src/bin/sh/parser.c
diff -r1.19 -r1.20 src/bin/sh/parser.h

cvs diff -r1.98 -r1.99 src/bin/sh/expand.c (expand / switch to context diff)
--- src/bin/sh/expand.c 2016/03/27 14:34:46 1.98
+++ src/bin/sh/expand.c 2016/03/27 14:39:33 1.99
@@ -1,4 +1,4 @@
-/*	$NetBSD: expand.c,v 1.98 2016/03/27 14:34:46 christos Exp $	*/
+/*	$NetBSD: expand.c,v 1.99 2016/03/27 14:39:33 christos Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993
@@ -37,7 +37,7 @@
 #if 0
 static char sccsid[] = "@(#)expand.c	8.5 (Berkeley) 5/15/95";
 #else
-__RCSID("$NetBSD: expand.c,v 1.98 2016/03/27 14:34:46 christos Exp $");
+__RCSID("$NetBSD: expand.c,v 1.99 2016/03/27 14:39:33 christos Exp $");
 #endif
 #endif /* not lint */
 
@@ -121,6 +121,12 @@
 void
 expandhere(union node *arg, int fd)
 {
+	/*
+	 * First, parse the content of the here doc (to internal form)
+	 * It was initially saved as (almost) unmodified text.
+	 */
+	parse_heredoc(arg);
+
 	herefd = fd;
 	expandarg(arg, NULL, 0);
 	xwrite(fd, stackblock(), expdest - stackblock());

cvs diff -r1.110 -r1.111 src/bin/sh/parser.c (expand / switch to context diff)
--- src/bin/sh/parser.c 2016/03/27 14:36:29 1.110
+++ src/bin/sh/parser.c 2016/03/27 14:39:33 1.111
@@ -1,4 +1,4 @@
-/*	$NetBSD: parser.c,v 1.110 2016/03/27 14:36:29 christos Exp $	*/
+/*	$NetBSD: parser.c,v 1.111 2016/03/27 14:39:33 christos Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993
@@ -37,7 +37,7 @@
 #if 0
 static char sccsid[] = "@(#)parser.c	8.7 (Berkeley) 5/16/95";
 #else
-__RCSID("$NetBSD: parser.c,v 1.110 2016/03/27 14:36:29 christos Exp $");
+__RCSID("$NetBSD: parser.c,v 1.111 2016/03/27 14:39:33 christos Exp $");
 #endif
 #endif /* not lint */
 
@@ -69,8 +69,6 @@
  * Shell command parser.
  */
 
-#define EOFMARKLEN 79
-
 /* values returned by readtoken */
 #include "token.h"
 
@@ -111,11 +109,12 @@
 STATIC union node *simplecmd(union node **, union node *);
 STATIC union node *makename(void);
 STATIC void parsefname(void);
-STATIC void parseheredoc(void);
+STATIC void slurp_heredoc(char *const, int, int);
+STATIC void readheredocs(void);
 STATIC int peektoken(void);
 STATIC int readtoken(void);
 STATIC int xxreadtoken(void);
-STATIC int readtoken1(int, char const *, char *, int);
+STATIC int readtoken1(int, char const *, int);
 STATIC int noexpand(char *);
 STATIC void synexpect(int, const char *) __dead;
 STATIC void synerror(const char *) __dead;
@@ -196,7 +195,7 @@
 			/* FALLTHROUGH */
 		case TNL:
 			if (tok == TNL) {
-				parseheredoc();
+				readheredocs();
 				if (nlflag)
 					return n1;
 			} else {
@@ -208,7 +207,7 @@
 			break;
 		case TEOF:
 			if (heredoclist)
-				parseheredoc();
+				readheredocs();
 			else
 				pungetc();	/* push back EOF on input */
 			return n1;
@@ -671,7 +670,6 @@
 	if (n->type == NHERE) {
 		struct heredoc *here = heredoc;
 		struct heredoc *p;
-		int i;
 
 		if (quoteflag == 0)
 			n->type = NXHERE;
@@ -680,8 +678,21 @@
 			while (*wordtext == '\t')
 				wordtext++;
 		}
-		if (! noexpand(wordtext) || (i = strlen(wordtext)) == 0 || i > EOFMARKLEN)
+
+		/*
+		 * this test is not really necessary, we are not
+		 * required to expand wordtext, but there's no reason
+		 * it cannot be $$ or something like that - that would
+		 * not mean the pid, but literally two '$' characters.
+		 * There is no need for limits on what the word can be.
+		 * However, it needs to stay literal as entered, not
+		 * have $ converted to CTLVAR or something, which, as the
+		 * parser stands at the minute, is impossible to prevent.
+		 * So, leave it like this until the rest of the parser is fixed.
+		 */
+		if (! noexpand(wordtext))
 			synerror("Illegal eof marker for << redirection");
+
 		rmescapes(wordtext);
 		here->eofmark = wordtext;
 		here->next = NULL;
@@ -699,32 +710,142 @@
 	}
 }
 
+/*
+ * Check to see whether we are at the end of the here document.  When this
+ * is called, c is set to the first character of the next input line.  If
+ * we are at the end of the here document, this routine sets c to PEOF.
+ * The new value of c is returned.
+ */
 
+static int
+checkend(int c, char * const eofmark, const int striptabs)
+{
+	if (striptabs) {
+		while (c == '\t')
+			c = pgetc();
+	}
+	if (c == PEOF) {
+		if (*eofmark == '\0')
+			return (c);
+		synerror(EOFhere);
+	}
+	if (c == *eofmark) {
+		int c2;
+		char *q;
+
+		for (q = eofmark + 1; c2 = pgetc(), *q != '\0' && c2 == *q; q++)
+			;
+		if ((c2 == PEOF || c2 == '\n') && *q == '\0') {
+			c = PEOF;
+			if (c2 == '\n') {
+				plinno++;
+				needprompt = doprompt;
+			}
+		} else {
+			pungetc();
+			pushstring(eofmark + 1, q - (eofmark + 1), NULL);
+		}
+	} else if (c == '\n' && *eofmark == '\0') {
+		c = PEOF;
+		plinno++;
+		needprompt = doprompt;
+	}
+	return (c);
+}
+
+
 /*
  * Input any here documents.
  */
 
 STATIC void
-parseheredoc(void)
+slurp_heredoc(char *const eofmark, int striptabs, int sq)
 {
+	int c;
+	char *out;
+
+	c = pgetc();
+
+	/*
+	 * If we hit EOF on the input, and the eofmark is a null string ('')
+	 * we consider this empty line to be the eofmark, and exit without err.
+	 */
+	if (c == PEOF && *eofmark != '\0')
+		synerror(EOFhere);
+
+	STARTSTACKSTR(out);
+
+	while ((c = checkend(c, eofmark, striptabs)) != PEOF) {
+		do {
+			if (sq) {
+				/*
+				 * in single quoted mode (eofmark quoted)
+				 * all we look for is \n so we can check
+				 * for the eofmark - everything saved literally.
+				 */
+				STPUTC(c, out);
+				if (c == '\n')
+					break;
+				continue;
+			}
+			/*
+			 * In double quoted (non-quoted eofmark)
+			 * we must handle \ followed by \n here
+			 * otherwise we can mismatch the end mark.
+			 * All other uses of \ will be handled later
+			 * when the here doc is expanded.
+			 *
+			 * This also makes sure \\ followed by \n does
+			 * not suppress the newline (the \ quotes itself)
+			 */
+			if (c == '\\') {		/* A backslash */
+				c = pgetc();		/* followed by */
+				if (c == '\n')		/* a newline?  */
+					continue;	/* y:drop both */
+				STPUTC('\\', out);	/* else keep \ */
+			}
+			STPUTC(c, out);			/* keep the char */
+			if (c == '\n')			/* at end of line */
+				break;			/* look for eofmark */
+
+		} while ((c = pgetc()) != PEOF);
+
+		/*
+		 * If we have read a line, and reached EOF, without
+		 * finding the eofmark, whether the EOF comes before
+		 * or immediately after the \n, that is an error.
+		 */
+		if (c == PEOF || (c = pgetc()) == PEOF)
+			synerror(EOFhere);
+	}
+	STPUTC('\0', out);
+
+	c = out - stackblock();
+	out = stackblock();
+	grabstackblock(c);
+	wordtext = out;
+
+	TRACE(("Slurped a heredoc (to '%s')%s: len %d, \"%.16s\"...\n",
+		eofmark, striptabs ? " tab stripped" : "", c, wordtext));
+}
+
+STATIC void
+readheredocs(void)
+{
 	struct heredoc *here;
 	union node *n;
 
 	while (heredoclist) {
-		int c;
-
 		here = heredoclist;
 		heredoclist = here->next;
 		if (needprompt) {
 			setprompt(2);
 			needprompt = 0;
 		}
-		if ((c = pgetc()) == PEOF) {
-			synerror(EOFhere);
-			/* NOTREACHED */
-		}
-		readtoken1(c, here->here->type == NHERE? SQSYNTAX : DQSYNTAX,
-		    here->eofmark, here->striptabs);
+
+		slurp_heredoc(here->eofmark, here->striptabs,
+		    here->here->nhere.type == NHERE);
+
 		n = stalloc(sizeof(struct narg));
 		n->narg.type = NARG;
 		n->narg.next = NULL;
@@ -734,6 +855,25 @@
 	}
 }
 
+void
+parse_heredoc(union node *n)
+{
+	if (n->narg.type != NARG)
+		abort();
+
+	if (n->narg.text[0] == '\0')		/* nothing to do */
+		return;
+
+	setinputstring(n->narg.text, 1);
+
+	readtoken1(pgetc(), DQSYNTAX, 1);
+
+	n->narg.text = wordtext;
+	n->narg.backquote = backquotelist;
+
+	popfile();
+}
+
 STATIC int
 peektoken(void)
 {
@@ -764,7 +904,7 @@
 		if (checkkwd == 2) {
 			checkkwd = 0;
 			while (t == TNL) {
-				parseheredoc();
+				readheredocs();
 				t = xxreadtoken();
 			}
 		} else
@@ -887,7 +1027,7 @@
 			}
 			/* FALLTHROUGH */
 		default:
-			return readtoken1(c, BASESYNTAX, NULL, 0);
+			return readtoken1(c, BASESYNTAX, 0);
 		}
 	}
 #undef RETURN
@@ -1039,7 +1179,6 @@
 	}
 }
 
-#define	CHECKEND()	{goto checkend; checkend_return:;}
 #define	PARSEREDIR()	{goto parseredir; parseredir_return:;}
 #define	PARSESUB()	{goto parsesub; parsesub_return:;}
 #define	PARSEARITH()	{goto parsearith; parsearith_return:;}
@@ -1232,12 +1371,11 @@
 }
 
 STATIC int
-readtoken1(int firstc, char const *syn, char *eofmark, int striptabs)
+readtoken1(int firstc, char const *syn, int magicq)
 {
 	int c = firstc;
 	char * out;
 	int len;
-	char line[EOFMARKLEN + 1];
 	struct nodelist *bqlist;
 	int quotef;
 	VSS static_stack;
@@ -1260,7 +1398,6 @@
 
 	STARTSTACKSTR(out);
 	loop: {	/* for each line, until end of word */
-		CHECKEND();	/* set c to PEOF if at end of here document */
 		for (;;) {	/* until end of line or end of word */
 			CHECKSTRSPACE(4, out);	/* permit 4 calls to USTPUTC */
 			switch(syntax[c]) {
@@ -1279,7 +1416,7 @@
 				USTPUTC(c, out);
 				break;
 			case CCTL:
-				if (eofmark == NULL || ISDBLQUOTE())
+				if (!magicq || ISDBLQUOTE())
 					USTPUTC(CTLESC, out);
 				USTPUTC(c, out);
 				break;
@@ -1301,11 +1438,11 @@
 				quotef = 1;
 				if (ISDBLQUOTE() && c != '\\' &&
 				    c != '`' && c != '$' &&
-				    (c != '"' || eofmark != NULL))
+				    (c != '"' || magicq))
 					USTPUTC('\\', out);
 				if (SQSYNTAX[c] == CCTL)
 					USTPUTC(CTLESC, out);
-				else if (eofmark == NULL) {
+				else if (!magicq) {
 					USTPUTC(CTLQUOTEMARK, out);
 					USTPUTC(c, out);
 					if (varnest != 0)
@@ -1316,7 +1453,7 @@
 				break;
 			case CSQUOTE:
 				if (syntax != SQSYNTAX) {
-					if (eofmark == NULL)
+					if (!magicq)
 						USTPUTC(CTLQUOTEMARK, out);
 					quotef = 1;
 					TS_PUSH();
@@ -1324,8 +1461,7 @@
 					quoted = SQ;
 					break;
 				}
-				if (eofmark != NULL && arinest == 0 &&
-				    varnest == 0) {
+				if (magicq && arinest == 0 && varnest == 0) {
 					/* Ignore inside quoted here document */
 					USTPUTC(c, out);
 					break;
@@ -1336,8 +1472,7 @@
 					USTPUTC(CTLQUOTEEND, out);
 				break;
 			case CDQUOTE:
-				if (eofmark != NULL && arinest == 0 &&
-				    varnest == 0) {
+				if (magicq && arinest == 0 && varnest == 0) {
 					/* Ignore inside here document */
 					USTPUTC(c, out);
 					break;
@@ -1354,7 +1489,7 @@
 					}
 					break;
 				}
-				if (eofmark != NULL)
+				if (magicq)
 					break;
 				if (ISDBLQUOTE()) {
 					TS_POP();
@@ -1421,7 +1556,7 @@
 		cleanup_state_stack(stack);
 		synerror("Missing '))'");
 	}
-	if (syntax != BASESYNTAX && /* ! parsebackquote && */ eofmark == NULL) {
+	if (syntax != BASESYNTAX && /* ! parsebackquote && */ !magicq) {
 		cleanup_state_stack(stack);
 		synerror("Unterminated quoted string");
 	}
@@ -1434,8 +1569,8 @@
 	USTPUTC('\0', out);
 	len = out - stackblock();
 	out = stackblock();
-	if (eofmark == NULL) {
-		if ((c == '>' || c == '<')
+	if (!magicq) {
+		if ((c == '<' || c == '>')
 		 && quotef == 0
 		 && (*out == '\0' || is_number(out))) {
 			PARSEREDIR();
@@ -1452,43 +1587,6 @@
 	cleanup_state_stack(stack);
 	return lasttoken = TWORD;
 /* end of readtoken routine */
-
-
-
-/*
- * Check to see whether we are at the end of the here document.  When this
- * is called, c is set to the first character of the next input line.  If
- * we are at the end of the here document, this routine sets the c to PEOF.
- */
-
-checkend: {
-	if (eofmark) {
-		if (c == PEOF)
-			synerror(EOFhere);
-		if (striptabs) {
-			while (c == '\t')
-				c = pgetc();
-		}
-		if (c == *eofmark) {
-			if (pfgets(line, sizeof line) != NULL) {
-				char *p, *q;
-
-				p = line;
-				for (q = eofmark + 1 ; *q && *p == *q ; p++, q++)
-					continue;
-				if ((*p == '\0' || *p == '\n') && *q == '\0') {
-					c = PEOF;
-					plinno++;
-					needprompt = doprompt;
-				} else {
-					pushstring(line, strlen(line), NULL);
-				}
-			} else
-				synerror(EOFhere);
-		}
-	}
-	goto checkend_return;
-}
 
 
 /*

cvs diff -r1.19 -r1.20 src/bin/sh/parser.h (expand / switch to context diff)
--- src/bin/sh/parser.h 2016/02/22 20:02:00 1.19
+++ src/bin/sh/parser.h 2016/03/27 14:39:33 1.20
@@ -1,4 +1,4 @@
-/*	$NetBSD: parser.h,v 1.19 2016/02/22 20:02:00 christos Exp $	*/
+/*	$NetBSD: parser.h,v 1.20 2016/03/27 14:39:33 christos Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993
@@ -81,5 +81,6 @@
 
 union node *parsecmd(int);
 void fixredir(union node *, const char *, int);
+void parse_heredoc(union node *);
 int goodname(char *);
 const char *getprompt(void *);