Content-Transfer-Encoding: 7bit
Content-Type: multipart/mixed; boundary="_----------=_159742629573210"
MIME-Version: 1.0
Date: Fri, 14 Aug 2020 17:31:35 +0000
From: "Sean Cole" <scole@netbsd.org>
Subject: CVS commit: pkgsrc/textproc/split-thai
To: pkgsrc-changes@NetBSD.org
Reply-To: scole@netbsd.org
Message-Id: <20200814173135.22C09FB28@cvs.NetBSD.org>
Sender: pkgsrc-changes-owner@NetBSD.org
Precedence: bulk

This is a multi-part message in MIME format.

--_----------=_159742629573210
Content-Disposition: inline
Content-Transfer-Encoding: 8bit
Content-Type: text/plain; charset="US-ASCII"

Module Name:	pkgsrc
Committed By:	scole
Date:		Fri Aug 14 17:31:35 UTC 2020

Modified Files:
	pkgsrc/textproc/split-thai: Makefile PLIST
	pkgsrc/textproc/split-thai/files: README.txt st-emacs thai-utility.el

Log Message:
Update to version 0.2

- generate the emacs dictionary once at build time, not every time the
  program is run
- clean up the README


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 pkgsrc/textproc/split-thai/Makefile \
    pkgsrc/textproc/split-thai/PLIST
cvs rdiff -u -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/README.txt \
    pkgsrc/textproc/split-thai/files/st-emacs \
    pkgsrc/textproc/split-thai/files/thai-utility.el

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.


--_----------=_159742629573210
Content-Disposition: inline
Content-Length: 10059
Content-Transfer-Encoding: binary
Content-Type: text/x-diff; charset=utf-8

Modified files:

Index: pkgsrc/textproc/split-thai/Makefile
diff -u pkgsrc/textproc/split-thai/Makefile:1.1 pkgsrc/textproc/split-thai/Makefile:1.2
--- pkgsrc/textproc/split-thai/Makefile:1.1	Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/Makefile	Fri Aug 14 17:31:34 2020
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $
+# $NetBSD: Makefile,v 1.2 2020/08/14 17:31:34 scole Exp $
 
-PKGNAME=	split-thai-0.1
+PKGNAME=	split-thai-0.2
 CATEGORIES=	textproc
 MAINTAINER=	pkgsrc-users@NetBSD.org
 COMMENT=	Utilities to split UTF-8 Thai text into words
@@ -26,6 +26,9 @@ UTF8_ENV=	env LC_ALL=C.UTF-8
 ST_SHARE_DIR=		share/split-thai
 INSTALLATION_DIRS=	bin ${ST_SHARE_DIR}
 
+ST_SHARE_FILES=		README.txt thaidict thai-dict.el thai-dict.elc
+ST_SHARE_FILES+=	thai-utility.el thai-utility.elc thaidict.tri
+
 # xxx REPLACE_EMACS_SCRIPT
 SUBST_CLASSES+=			st-emacs-app
 SUBST_STAGE.st-emacs-app=	pre-configure
@@ -47,7 +50,7 @@ pre-extract:
 post-extract:
 	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
 		-f batch-byte-compile thai-utility.el
-	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \
+	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
 		--eval '(thai-word-table-save "emacs-dict")'
 	cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
 	cd ${PREFIX}/share/swath && \
@@ -58,6 +61,10 @@ post-extract:
 			grep -v '#' | sort | uniq > thaidict
 	cd ${WRKSRC} && \
 		${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
+	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
+		--eval '(thai-word-table-save-defvar "thaidict" "thai-dict.el")'
+	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
+		-f batch-byte-compile thai-dict.el
 .for i in emacs-dict icu-dict swath-dict
 	@${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
 .endfor
@@ -73,7 +80,7 @@ do-install:
 	${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
 		${DESTDIR}${PREFIX}/bin
 	${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
-.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri
+.for i in ${ST_SHARE_FILES}
 	${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
 .endfor
 
Index: pkgsrc/textproc/split-thai/PLIST
diff -u pkgsrc/textproc/split-thai/PLIST:1.1 pkgsrc/textproc/split-thai/PLIST:1.2
--- pkgsrc/textproc/split-thai/PLIST:1.1	Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/PLIST	Fri Aug 14 17:31:34 2020
@@ -1,8 +1,10 @@
-@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $
+@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $
 bin/st-emacs
 bin/st-icu
 bin/st-swath
 share/split-thai/README.txt
+share/split-thai/thai-dict.el
+share/split-thai/thai-dict.elc
 share/split-thai/thai-utility.el
 share/split-thai/thai-utility.elc
 share/split-thai/thaidict

Index: pkgsrc/textproc/split-thai/files/README.txt
diff -u pkgsrc/textproc/split-thai/files/README.txt:1.1 pkgsrc/textproc/split-thai/files/README.txt:1.2
--- pkgsrc/textproc/split-thai/files/README.txt:1.1	Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/README.txt	Fri Aug 14 17:31:34 2020
@@ -1,49 +1,70 @@
-This is a collection of utilities to separate Thai words by spaces
-(word tokenization).  They can separate stdin, files, or text as
-arguments.  It includes 3 separate utilities:
-
-st-emacs:  emacs-script using emacs lisp thai-word library
-           https://www.gnu.org/software/emacs/
-st-icu:    basic C++ program using the ICU library
-           http://site.icu-project.org/
-st-swath:  sh script wrapper to simplfy args to the swath program
-           https://linux.thai.net/projects/swath
-
-All scripts should be able to take a filename, stdin, or arguments as
-input, e.g., :
+NAME
+     st-emacs
+     st-icu
+     st-swath
+
+SYNOPSIS
+     st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
+
+DESCRIPTION
+     This package is a collection of utilities to separate Thai words
+     by spaces (word tokenization).  They can separate stdin, files,
+     or text as arguments.  It includes 3 separate utilities:
+
+     st-emacs:  emacs-script using emacs lisp thai-word library
+                https://www.gnu.org/software/emacs/
+     st-icu:    basic C++ program using the ICU library
+                http://site.icu-project.org/
+     st-swath:  sh script wrapper to simplify args to the swath program
+                https://linux.thai.net/projects/swath
 
+EXAMPLES
+      split one or more text strings
       # st-swath แมวและหมา
-or
-      # echo "แมวและหมา" | st-swath
-or      
-      # st-swath < thaifile.txt
-or
       # st-swath "แมวหมา" พ่อและแม่
       
-You will most likely need to set LC_ALL or LC_CTYPE to an approriate
-unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for
-them to work properly.  These tools are setup to only support UTF-8
-encodings.
-
-Note that it is not possible to split Thai words 100% accurately
-without context and meaning.  These programs use dictionary-based word
-splitting.
-
-Also included in the package is a combined thai word dictionary and
-corresponding .tri file, and emacs lisp .el file for reading and
-dumping out dictionary files.
-
-st-emacs and st-swath are setup to use the combined dictionary with
-words from the emacs 'thai-word library, swath dictionary words, and
-the icu thai library words.
-
-st-icu uses its own built in library.  To customise the icu
-dictionary, you apparently would have to modify
-  icu4c/source/data/brkitr/dictionaries/thaidict.txt
-and rebuild icu library, and then rebuild the whole thing.
-
-There is also 
+      read stdin
+      # echo "แมวและหมา" | st-swath
 
-See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+      read from a file
+      # st-swath < thaifile.txt
+      # st-swath somefile.txt
 
-TODO - fix st-icu to use all the combined dictionary words.
+      They can also read directly from stdin
+      # st-icu
+        แมวหมา   (typed in)
+        แมว หมา  (output line by line)
+
+ENVIRONMENT
+     You will most likely need to set the environment variables LC_ALL
+     or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or
+     C.UTF-8.  These tools are set up to handle only UTF-8 encodings.
+
+EXIT STATUS
+     0 for success, non zero otherwise
+
+NOTES
+     Note that it is not possible to split Thai words 100% accurately
+     without context and meaning.  All these programs use
+     dictionary-based word splitting.
+
+     Also included in the package is a combined thai word dictionary
+     and corresponding .tri file, and emacs lisp .el files for reading
+     and dumping out dictionary files.
+
+     st-emacs and st-swath are set up to use the combined dictionary
+     with words from the emacs 'thai-word library, swath dictionary
+     words, and the icu thai library words.
+
+     st-icu uses its own built in library.  To customise the icu
+     dictionary, you apparently would have to modify
+     icu4c/source/data/brkitr/dictionaries/thaidict.txt and then
+     rebuild the whole library.
+
+SEE ALSO
+     swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+
+BUGS
+     st-icu should also use the combined dictionary words.
+     st-emacs and st-icu don't always split thai numbers well.
+     this file should be converted to a proper manpage.
Index: pkgsrc/textproc/split-thai/files/st-emacs
diff -u pkgsrc/textproc/split-thai/files/st-emacs:1.1 pkgsrc/textproc/split-thai/files/st-emacs:1.2
--- pkgsrc/textproc/split-thai/files/st-emacs:1.1	Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-emacs	Fri Aug 14 17:31:34 2020
@@ -12,7 +12,7 @@
 
 ;; load custom dictionary
 (load "ST_SHARE_DIR/thai-utility" nil t)
-(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict")
+(load "ST_SHARE_DIR/thai-dict" nil t)
 
 ;; split a thai line by spaces, return new line
 (defun process-thai-line(line)
Index: pkgsrc/textproc/split-thai/files/thai-utility.el
diff -u pkgsrc/textproc/split-thai/files/thai-utility.el:1.1 pkgsrc/textproc/split-thai/files/thai-utility.el:1.2
--- pkgsrc/textproc/split-thai/files/thai-utility.el:1.1	Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/thai-utility.el	Fri Aug 14 17:31:34 2020
@@ -43,8 +43,8 @@ uses recursion"
 
 (defun thai-word-table-save(filename &optional alist)
   "save thai words extracted from a nested-alist table to
-filename in utf8 format.  default is to save 'thai-word-table if
-no alist argument given."
+filename in utf8 format, one word per line.  default is to save
+'thai-word-table if no alist argument given."
   (interactive)
   (let ((thaiwords)
 	(elem)
@@ -95,3 +95,44 @@ is appended instead to the current word 
       (thai-update-word-table temp_file append)
       (delete-file temp_file)
       thai-word-table)))
+
+(defun thai-word-table-save-defvar(dictfile lispfile)
+  "read a utf8 thai dictionary file and save to a lisp file
+suitable for initializing the 'thai-word-table as a \"defvar\".
+Overwrites the lisp file if it exists."
+  (interactive)
+  (let ((header)
+	(footer)
+	(elem)
+	(coding-system-for-read 'utf-8)
+	(coding-system-for-write 'utf-8)
+	(buffer-file-coding-system 'utf-8))
+    (setq header (list "(defvar thai-word-table"
+		       "(let ((table (list 'thai-words)))"
+		       "(dolist (elt"
+		       "'(" ))
+    (setq footer (list "))"
+		       "(set-nested-alist elt 1 table))"
+		       "table)"
+		       "\"Nested alist of Thai words.\")" ))
+    (with-temp-buffer
+      (insert-file-contents dictfile)
+      (goto-char (point-min))
+      ;; quote each thai word
+      (while (not (eobp))
+	(beginning-of-line)
+	(insert "\"")
+	(end-of-line)
+	(insert "\"")
+	(forward-line 1))
+
+      (goto-char (point-min))
+      (dolist (elem header)
+	(insert elem "\n"))
+
+      (goto-char (point-max))
+      (dolist (elem footer)
+	(insert elem "\n"))
+      (lisp-mode)
+      (indent-region (point-min) (point-max))
+      (write-region nil nil lispfile))))


--_----------=_159742629573210--