From zxycba2004@yahoo.com  Sat Jan 22 21:18:41 2005
Return-Path: <zxycba2004@yahoo.com>
Received: from mx1.FreeBSD.org (mx1.freebsd.org [216.136.204.125])
	by hub.freebsd.org (Postfix) with ESMTP id C51D916A4CE
	for <FreeBSD-gnats-submit@freebsd.org>; Sat, 22 Jan 2005 21:18:41 +0000 (GMT)
Received: from web53405.mail.yahoo.com (web53405.mail.yahoo.com [206.190.37.52])
	by mx1.FreeBSD.org (Postfix) with SMTP id 532F343D3F
	for <FreeBSD-gnats-submit@freebsd.org>; Sat, 22 Jan 2005 21:18:41 +0000 (GMT)
	(envelope-from zxycba2004@yahoo.com)
Received: (qmail 83529 invoked by uid 60001); 22 Jan 2005 21:18:40 -0000
Received: from [38.119.184.14] by web53405.mail.yahoo.com via HTTP; Sat, 22 Jan 2005 13:18:40 PST
Message-Id: <20050122211840.83527.qmail@web53405.mail.yahoo.com>
Date: Sat, 22 Jan 2005 13:18:40 -0800 (PST)
From: "Scot W. Hetzel" <zxycba2004@yahoo.com>
Reply-To: swhetzel@gmail.com
To: FreeBSD-gnats-submit@freebsd.org
Subject: uniq truncates long lines to LINE_MAX (2048)

>Number:         76578
>Category:       bin
>Synopsis:       [patch] uniq(1) truncates long lines to LINE_MAX
>Confidential:   no
>Severity:       serious
>Priority:       low
>Responsible:    ghelmer
>State:          closed
>Quarter:        
>Keywords:       
>Date-Required:  
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Sat Jan 22 21:20:37 GMT 2005
>Closed-Date:    Fri Feb 08 17:06:12 CST 2008
>Last-Modified:  Thu Mar  6 15:10:06 UTC 2008
>Originator:     Scot W. Hetzel
>Release:        FreeBSD 5.3-STABLE i386
>Organization:
>Environment:
System: FreeBSD bsd5x.hetzel.org 5.3-STABLE FreeBSD
5.3-STABLE #6: Sun Jan 16 03:44:45 CST 2005
root@bsd5x.hetzel.org:/usr/obj/usr/src/5x/sys/GENERIC
i386

>Description:
I noticed that if you have a file with long lines (>2048 bytes),
uniq(1) will truncate the line to 2048 bytes.  This truncation is
caused by the getline routine in src/usr.bin/uniq/uniq.c: if it
reaches buflen-2 characters without reaching the end of the line,
it discards the remainder of the line (reading until it gets a
newline or WEOF).

I consider this a bug, as uniq(1) shouldn't be
discarding the end of a line.

If sort(1) is used on the same file, the line isn't
truncated.

>How-To-Repeat:
A simple way to repeat is to do the following:

  cd /usr/ports/accessibility/gnomemag
  make fetch-list > tmp.list
  make fetch-list >> tmp.list
  uniq tmp.list > tmp2.list
  ls -l tmp*list

  -rw-r--r-- 1 root wheel 4540 Jan 22 03:31 tmp.list
  -rw-r--r-- 1 root wheel 2048 Jan 22 03:32 tmp2.list

tmp2.list should be half the size of tmp.list, but
instead it's 2048 bytes.

NOTE: make sure you don't have the distfile for
gnomemag before using fetch-list

>Fix:
The following patch makes it so that uniq no longer
truncates long lines.

NOTE: I haven't figured out how to pass the updated 'buflen'
back from getline to 'thisbuflen' or 'prevbuflen'.  As a result,
getline will realloc the line buffer again whenever a line is
longer than LINE_MAX, instead of only when it is longer than the
new buflen/sizeof(*thisline).

Index: uniq.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/uniq/uniq.c,v
retrieving revision 1.25
diff -u -r1.25 uniq.c
--- uniq.c	2 Jul 2004 23:43:05 -0000	1.25
+++ uniq.c	22 Jan 2005 09:14:38 -0000
@@ -50,6 +50,7 @@
 
 #include <ctype.h>
 #include <err.h>
+#include <errno.h>
 #include <limits.h>
 #include <locale.h>
 #include <stdio.h>
@@ -71,13 +72,15 @@
 void	 obsolete(char *[]);
 static void	 usage(void);
 int      wcsicoll(wchar_t *, wchar_t *);
+static wchar_t *__wcsalloc(wchar_t *);
 
 int
 main (int argc, char *argv[])
 {
 	wchar_t *t1, *t2;
 	FILE *ifp, *ofp;
-	int ch;
+	int ch, b1;
+	size_t prevbuflen, thisbuflen;
 	wchar_t *prevline, *thisline;
 	char *p;
 	const char *ifn;
@@ -136,20 +139,24 @@
 	if (argc > 1)
 		ofp = file(argv[1], "w");
 
-	prevline = malloc(MAXLINELEN * sizeof(*prevline));
-	thisline = malloc(MAXLINELEN * sizeof(*thisline));
+	prevbuflen = MAXLINELEN * sizeof(*prevline);
+	thisbuflen = MAXLINELEN * sizeof(*thisline);
+	prevline = malloc(prevbuflen);
+	thisline = malloc(thisbuflen);
+
 	if (prevline == NULL || thisline == NULL)
 		err(1, "malloc");
 
-	if (getline(prevline, MAXLINELEN, ifp) == NULL) {
+	if (getline(prevline, prevbuflen, ifp) == NULL) {
 		if (ferror(ifp))
 			err(1, "%s", ifp == stdin ? "stdin" : argv[0]);
 		exit(0);
 	}
+
 	if (!cflag && uflag && dflag)
 		show(ofp, prevline);
 
-	while (getline(thisline, MAXLINELEN, ifp)) {
+	while (getline(thisline, thisbuflen, ifp)) {
 		/* If requested get the chosen fields + character
offsets. */
 		if (numfields || numchars) {
 			t1 = skip(thisline);
@@ -169,10 +176,13 @@
 			if (cflag || !dflag || !uflag)
 				show(ofp, prevline);
 			t1 = prevline;
+			b1 = prevbuflen;
 			prevline = thisline;
+			prevbuflen = thisbuflen;
 			if (!cflag && uflag && dflag)
 				show(ofp, prevline);
 			thisline = t1;
+			thisbuflen = b1;
 			repeats = 0;
 		} else
 			++repeats;
@@ -191,12 +201,15 @@
 	wint_t ch;
 
 	bufpos = 0;
-	while (bufpos + 2 != buflen && (ch = getwc(fp)) !=
WEOF && ch != '\n')
+	while ((ch = getwc(fp)) != WEOF && ch != '\n') {
+		if ((bufpos + 1) == buflen) {
+			buflen = buflen + (1024 * sizeof(*buf));
+			buf = realloc(buf,buflen);
+		}
 		buf[bufpos++] = ch;
+	}
 	if (bufpos + 1 != buflen)
 		buf[bufpos] = '\0';
-	while (ch != WEOF && ch != '\n')
-		ch = getwc(fp);
 
 	return (bufpos != 0 || ch == '\n' ? buf : NULL);
 }
@@ -278,16 +291,42 @@
 	exit(1);
 }
 
+static wchar_t *
+__wcsalloc(wchar_t *ws)
+{
+	wchar_t *wcs;
+
+	if ((wcs = malloc(wcslen(ws) + 1)) == NULL)
+		return (NULL);
+
+	return (wcs);
+}
+
 int
 wcsicoll(wchar_t *s1, wchar_t *s2)
 {
-	wchar_t *p, line1[MAXLINELEN], line2[MAXLINELEN];
+	wchar_t *p, *l1, *l2;
+	int diff, sverrno;
+
+	if ((l1 = __wcsalloc(s1)) == NULL || (l2 =
__wcsalloc(s2)) == NULL) {
+		sverrno = errno;
+		free(l1);
+		errno = sverrno;
+		return(wcscmp(s1,s2));
+	}
 
-	for (p = line1; *s1; s1++)
+	for (p = l1; *s1; s1++)
 		*p++ = towlower(*s1);
 	*p = '\0';
-	for (p = line2; *s2; s2++)
+	for (p = l2; *s2; s2++)
 		*p++ = towlower(*s2);
 	*p = '\0';
-	return (wcscoll(line1, line2));
+
+	diff = wcscoll(l1,l2);
+	sverrno = errno;
+	free(l1);
+	free(l2);
+	errno = sverrno;
+
+	return (diff);
 }



__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 
>Release-Note:
>Audit-Trail:
Responsible-Changed-From-To: freebsd-bugs->ghelmer 
Responsible-Changed-By: ghelmer 
Responsible-Changed-When: Tue Jan 29 08:01:13 CST 2008 
Responsible-Changed-Why:  
I'll take this. 

http://www.freebsd.org/cgi/query-pr.cgi?pr=76578 

From: Guy Helmer <ghelmer@palisadesys.com>
To: bug-followup@FreeBSD.org, swhetzel@gmail.com
Cc:  
Subject: Re: bin/76578: [patch] uniq(1) truncates long lines to LINE_MAX
Date: Tue, 29 Jan 2008 17:34:17 -0600

 I've taken your changes and modified them somewhat to reduce the number 
 of memory allocations in wcsicoll(), plus pass the buffer length into 
 getline() as a pass-by-reference parameter so getline() doesn't have to 
 continually reallocate its buffer for long lines.  Would you be able to 
 test the changes and give me feedback?
 
 Thanks,
 Guy Helmer
 
 cvs diff: Diffing .
 Index: uniq.c
 ===================================================================
 RCS file: /home/ncvs/src/usr.bin/uniq/uniq.c,v
 retrieving revision 1.29
 diff -u -r1.29 uniq.c
 --- uniq.c      17 May 2007 00:19:56 -0000      1.29
 +++ uniq.c      25 Jan 2008 16:56:06 -0000
 @@ -50,6 +50,7 @@
 
  #include <ctype.h>
  #include <err.h>
 +#include <errno.h>
  #include <limits.h>
  #include <locale.h>
  #include <stdio.h>
 @@ -65,7 +66,7 @@
  int numchars, numfields, repeats;
 
  FILE   *file(const char *, const char *);
 -wchar_t        *getline(wchar_t *, size_t, FILE *);
 +wchar_t        *getline(wchar_t *, size_t *, FILE *);
  void    show(FILE *, wchar_t *);
  wchar_t        *skip(wchar_t *);
  void    obsolete(char *[]);
 @@ -77,7 +78,8 @@
  {
         wchar_t *t1, *t2;
         FILE *ifp, *ofp;
 -       int ch;
 +       int ch, b1;
 +       size_t prevbuflen, thisbuflen;
         wchar_t *prevline, *thisline;
         char *p;
         const char *ifn;
 @@ -136,12 +138,14 @@
         if (argc > 1)
                 ofp = file(argv[1], "w");
 
 -       prevline = malloc(MAXLINELEN * sizeof(*prevline));
 -       thisline = malloc(MAXLINELEN * sizeof(*thisline));
 +       prevbuflen = MAXLINELEN * sizeof(*prevline);
 +       thisbuflen = MAXLINELEN * sizeof(*thisline);
 +       prevline = malloc(prevbuflen);
 +       thisline = malloc(thisbuflen);
         if (prevline == NULL || thisline == NULL)
                 err(1, "malloc");
 
 -       if (getline(prevline, MAXLINELEN, ifp) == NULL) {
 +       if ((prevline = getline(prevline, &prevbuflen, ifp)) == NULL) {
                 if (ferror(ifp))
                         err(1, "%s", ifn);
                 exit(0);
 @@ -149,7 +153,7 @@
         if (!cflag && uflag && dflag)
                 show(ofp, prevline);
 
 -       while (getline(thisline, MAXLINELEN, ifp)) {
 +       while ((thisline = getline(thisline, &thisbuflen, ifp)) != NULL) {
                 /* If requested get the chosen fields + character offsets. */
                 if (numfields || numchars) {
                         t1 = skip(thisline);
 @@ -169,10 +173,13 @@
                         if (cflag || !dflag || !uflag)
                                 show(ofp, prevline);
                         t1 = prevline;
 +                       b1 = prevbuflen;
                         prevline = thisline;
 +                       prevbuflen = thisbuflen;
                         if (!cflag && uflag && dflag)
                                 show(ofp, prevline);
                         thisline = t1;
 +                       thisbuflen = b1;
                         repeats = 0;
                 } else
                         ++repeats;
 @@ -185,18 +192,23 @@
  }
 
  wchar_t *
 -getline(wchar_t *buf, size_t buflen, FILE *fp)
 +getline(wchar_t *buf, size_t *buflen, FILE *fp)
  {
         size_t bufpos;
         wint_t ch;
 
         bufpos = 0;
 -       while (bufpos + 2 != buflen && (ch = getwc(fp)) != WEOF && ch != '\n')
 +       while ((ch = getwc(fp)) != WEOF && ch != '\n') {
 +               if ((bufpos + 1) == *buflen) {
 +                       *buflen = *buflen + (1024 * sizeof(*buf));
 +                       buf = reallocf(buf, *buflen);
 +                       if (buf == NULL)
 +                               return (NULL);
 +               }
                 buf[bufpos++] = ch;
 -       if (bufpos + 1 != buflen)
 +       }
 +       if (bufpos + 1 != *buflen)
                 buf[bufpos] = '\0';
 -       while (ch != WEOF && ch != '\n')
 -               ch = getwc(fp);
 
         return (bufpos != 0 || ch == '\n' ? buf : NULL);
  }
 @@ -278,16 +290,55 @@
         exit(1);
  }
 
 +static size_t wcsicoll_l1_buflen = 0, wcsicoll_l2_buflen = 0;
 +static wchar_t *wcsicoll_l1_buf = NULL, *wcsicoll_l2_buf = NULL;
 +
  int
  wcsicoll(wchar_t *s1, wchar_t *s2)
  {
 -       wchar_t *p, line1[MAXLINELEN], line2[MAXLINELEN];
 +       wchar_t *p;
 +       size_t l1, l2;
 +       size_t new_l1_buflen, new_l2_buflen;
 +
 +       l1 = wcslen(s1) + 1;
 +       l2 = wcslen(s2) + 1;
 +       new_l1_buflen = wcsicoll_l1_buflen;
 +       new_l2_buflen = wcsicoll_l2_buflen;
 +       while (new_l1_buflen < l1) {
 +               if (new_l1_buflen == 0)
 +                       new_l1_buflen = MAXLINELEN * sizeof(wchar_t);
 +               else
 +                       new_l1_buflen *= 2;
 +       }
 +       while (new_l2_buflen < l2) {
 +               if (new_l2_buflen == 0)
 +                       new_l2_buflen = MAXLINELEN * sizeof(wchar_t);
 +               else
 +                       new_l2_buflen *= 2;
 +       }
 +       if (new_l1_buflen > wcsicoll_l1_buflen) {
 +               wcsicoll_l1_buf = realloc(wcsicoll_l1_buf, new_l1_buflen);
 +               if (wcsicoll_l1_buf == NULL) {
 +                       wcsicoll_l1_buflen = 0;
 +                       return (wcscmp(s1, s2));
 +               }
 +               wcsicoll_l1_buflen = new_l1_buflen;
 +       }
 +       if (new_l2_buflen > wcsicoll_l2_buflen) {
 +               wcsicoll_l2_buf = realloc(wcsicoll_l2_buf, new_l2_buflen);
 +               if (wcsicoll_l2_buf == NULL) {
 +                       wcsicoll_l2_buflen = 0;
 +                       return (wcscmp(s1, s2));
 +               }
 +               wcsicoll_l2_buflen = new_l2_buflen;
 +       }
 
 -       for (p = line1; *s1; s1++)
 +       for (p = wcsicoll_l1_buf; *s1; s1++)
                 *p++ = towlower(*s1);
         *p = '\0';
 -       for (p = line2; *s2; s2++)
 +       for (p = wcsicoll_l2_buf; *s2; s2++)
                 *p++ = towlower(*s2);
         *p = '\0';
 -       return (wcscoll(line1, line2));
 +
 +       return (wcscoll(wcsicoll_l1_buf, wcsicoll_l2_buf));
  }
 
 

From: "Scot Hetzel" <swhetzel@gmail.com>
To: "Guy Helmer" <ghelmer@palisadesys.com>
Cc: bug-followup@freebsd.org
Subject: Re: bin/76578: [patch] uniq(1) truncates long lines to LINE_MAX
Date: Tue, 29 Jan 2008 23:53:47 -0600

 On 1/29/08, Guy Helmer <ghelmer@palisadesys.com> wrote:
 > I've taken your changes and modified them somewhat to reduce the number
 > of memory allocations in wcsicoll(), plus pass the buffer length into
 > getline() as a pass-by-reference parameter so getline() doesn't have to
 > continually reallocate its buffer for long lines.  Would you be able to
 > test the changes and give me feedback?
 >
 I tested the change using the simple test as listed in the PR and it passed.
 
 Thanks for looking into this problem
 
 Scot
State-Changed-From-To: open->closed 
State-Changed-By: ghelmer 
State-Changed-When: Fri Feb 8 17:04:21 CST 2008 
State-Changed-Why:  
Committed in src/usr.bin/uniq/uniq.c rev 1.30 after 
stress testing and fixing a couple of bugs. 

http://www.freebsd.org/cgi/query-pr.cgi?pr=76578 

From: dfilter@FreeBSD.ORG (dfilter service)
To: bug-followup@FreeBSD.org
Cc:  
Subject: Re: bin/76578: commit references a PR
Date: Fri,  8 Feb 2008 23:04:20 +0000 (UTC)

 ghelmer     2008-02-08 23:04:13 UTC
 
   FreeBSD src repository
 
   Modified files:
     usr.bin/uniq         uniq.c 
   Log:
   Fix truncation of lines at LINE_MAX characters by dynamically
   extending line buffers.
   
   PR:             bin/76578
   
   Revision  Changes    Path
   1.30      +61 -15    src/usr.bin/uniq/uniq.c
 _______________________________________________
 cvs-all@freebsd.org mailing list
 http://lists.freebsd.org/mailman/listinfo/cvs-all
 To unsubscribe, send any mail to "cvs-all-unsubscribe@freebsd.org"
 

From: dfilter@FreeBSD.ORG (dfilter service)
To: bug-followup@FreeBSD.org
Cc:  
Subject: Re: bin/76578: commit references a PR
Date: Thu,  6 Mar 2008 15:02:20 +0000 (UTC)

 ghelmer     2008-03-06 15:02:05 UTC
 
   FreeBSD src repository
 
   Modified files:        (Branch: RELENG_7)
     usr.bin/uniq         uniq.c 
   Log:
   MFH: 1.30
   
   Fix truncation of lines at LINE_MAX characters by dynamically
   extending line buffers.
   
   PR:             bin/76578
   
   Revision  Changes    Path
   1.29.2.1  +61 -15    src/usr.bin/uniq/uniq.c
 _______________________________________________
 cvs-all@freebsd.org mailing list
 http://lists.freebsd.org/mailman/listinfo/cvs-all
 To unsubscribe, send any mail to "cvs-all-unsubscribe@freebsd.org"
 

From: dfilter@FreeBSD.ORG (dfilter service)
To: bug-followup@FreeBSD.org
Cc:  
Subject: Re: bin/76578: commit references a PR
Date: Thu,  6 Mar 2008 15:07:42 +0000 (UTC)

 ghelmer     2008-03-06 15:07:37 UTC
 
   FreeBSD src repository
 
   Modified files:        (Branch: RELENG_6)
     usr.bin/uniq         uniq.c 
   Log:
   MFC: 1.30
   
   Fix truncation of lines at LINE_MAX characters by dynamically
   extending line buffers.
   
   PR:             bin/76578
   
   Revision  Changes    Path
   1.26.2.1  +61 -15    src/usr.bin/uniq/uniq.c
 _______________________________________________
 cvs-all@freebsd.org mailing list
 http://lists.freebsd.org/mailman/listinfo/cvs-all
 To unsubscribe, send any mail to "cvs-all-unsubscribe@freebsd.org"
 
>Unformatted:
