@@ -1,81 +1,88 @@
-# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $
+# $NetBSD: Makefile,v 1.2 2020/08/14 17:31:34 scole Exp $
-PKGNAME=	split-thai-0.1
+PKGNAME=	split-thai-0.2
 CATEGORIES=	textproc
 MAINTAINER=	pkgsrc-users@NetBSD.org
 COMMENT=	Utilities to split UTF-8 Thai text into words
 LICENSE=	public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
 # xxx fetching a specific version of a file out of a github project
 # The only distfile is the ICU Thai dictionary, pinned to the commit
 # named by GITHUB_ICU_TAG; it is a single text file, hence no suffix.
 EXTRACT_SUFX=	# none
 GITHUB_ICU_TAG=	61607c27732906d36c5bd4d23ecc092f89f53a2b
 DISTFILES=	thaidict-${GITHUB_ICU_TAG}.txt
 MASTER_SITES=	-${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt
 USE_LANGUAGES=	c++11	# darwin needed 11?
 USE_TOOLS=	pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
 # post-extract runs trietool (presumably provided by libdatrie -- confirm)
 # and emacs, and reads ${PREFIX}/share/swath; st-emacs and st-swath
 # use emacs and swath again at run time.
 BUILD_DEPENDS+=	libdatrie-[0-9]*:../../devel/libdatrie
 DEPENDS+=	emacs-[0-9]*:../../editors/emacs
 DEPENDS+=	swath-[0-9]*:../../textproc/swath
 REPLACE_SH=	st-swath
 # Command prefix forcing a UTF-8 locale; used for the emacs, trietool
 # and cat invocations in post-extract.
 UTF8_ENV=	env LC_ALL=C.UTF-8
 # Directory, relative to ${PREFIX}, that receives the dictionary and
 # lisp files, and the list of files do-install copies from ${WRKSRC}.
 ST_SHARE_DIR=		share/split-thai
 INSTALLATION_DIRS=	bin ${ST_SHARE_DIR}
 ST_SHARE_FILES=		README.txt thaidict thai-dict.el thai-dict.elc
 ST_SHARE_FILES+=	thai-utility.el thai-utility.elc thaidict.tri
 # xxx REPLACE_EMACS_SCRIPT
 # Rewrite the '!/bin/emacs' interpreter path in st-emacs to the emacs
 # binary under ${PREFIX}.
 SUBST_CLASSES+=			st-emacs-app
 SUBST_STAGE.st-emacs-app=	pre-configure
 SUBST_MESSAGE.st-emacs-app=	Fixing emacs script paths.
 SUBST_FILES.st-emacs-app=	st-emacs
 SUBST_SED.st-emacs-app=		-e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
 # Expand the literal ST_SHARE_DIR placeholder in both scripts to the
 # installed share directory.
 SUBST_CLASSES+=			dictionary-app
 SUBST_STAGE.dictionary-app=	pre-configure
 SUBST_MESSAGE.dictionary-app=	Fixing dictionary paths.
 SUBST_FILES.dictionary-app=	st-emacs st-swath
 SUBST_SED.dictionary-app=	-e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
 # Populate ${WRKSRC} with the sources shipped in the package's files
 # directory.  ${FILESDIR} is used rather than a relative "files" path
 # so the copy does not depend on the directory make runs recipes in.
 pre-extract:
 	mkdir -p ${WRKSRC}
 	cd ${FILESDIR} && cp README.txt st-emacs st-icu.cc st-swath \
 		thai-utility.el thaidict.abm ${WRKSRC}
 # Build the combined Thai word list and the files derived from it.
 post-extract:
 # Byte-compile the helper library so the later emacs runs can load
 # thai-utility.elc.
 	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
 		-f batch-byte-compile thai-utility.el
 # Save the words of emacs' own thai-word-table to emacs-dict.
-	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \
+	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
 		--eval '(thai-word-table-save "emacs-dict")'
 # The fetched ICU distfile is used unchanged as icu-dict.
 	cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
 # List the installed swath dictionary, keeping the first field of
 # each output line.
 	cd ${PREFIX}/share/swath && \
 		${UTF8_ENV} trietool swathdic list | \
 		awk '{print $$1}' > ${WRKSRC}/swath-dict
 # Merge the three lists, drop every line containing '#', then sort
 # and de-duplicate into thaidict.
 	cd ${WRKSRC} && \
 		${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
 			grep -v '#' | sort | uniq > thaidict
 # Add the merged words to the "thaidict" trie; presumably this is
 # what produces the thaidict.tri listed in ST_SHARE_FILES -- confirm.
 	cd ${WRKSRC} && \
 		${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
 # Generate thai-dict.el (a defvar of thai-word-table built from
 # thaidict) and byte-compile it.
 	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
 		--eval '(thai-word-table-save-defvar "thaidict" "thai-dict.el")'
 	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
 		-f batch-byte-compile thai-dict.el
 # Report the line count of each source list and of the combined one.
 .for i in emacs-dict icu-dict swath-dict
 	@${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
 .endfor
 	@${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
 		unique words in combined dictionary
 # Compile the single-file ICU splitter.  There is no upstream build
 # system, so the standard flag variables are passed by hand:
 # ${CXXFLAGS} and ${LDFLAGS} are included alongside ${CPPFLAGS} so
 # that compiler and linker settings from the environment are honoured.
 do-build:
 	cd ${WRKSRC} && \
 		${CXX} ${CPPFLAGS} ${CXXFLAGS} ${LDFLAGS} \
 		-o st-icu st-icu.cc \
 		`pkg-config --libs --cflags icu-io`
 # Install the two scripts and the ICU binary into bin, and every file
 # in ST_SHARE_FILES into ${ST_SHARE_DIR}.  The destination uses the
 # same variable as INSTALLATION_DIRS so the two cannot drift apart.
 do-install:
 	${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
 		${DESTDIR}${PREFIX}/bin
 	${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
-.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri
+.for i in ${ST_SHARE_FILES}
 	${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/${ST_SHARE_DIR}
 .endfor
 .include "../../textproc/icu/buildlink3.mk"
 .include "../../mk/bsd.pkg.mk"

 @@ -1,49 +1,70 @@
-This is a collection of utilities to separate Thai words by spaces
+NAME
-(word tokenization).  They can separate stdin, files, or text as
+     st-emacs
-arguments.  It includes 3 separate utilities:
+     st-icu
      st-swath
 st-emacs:  emacs-script using emacs lisp thai-word library
-           https://www.gnu.org/software/emacs/
+SYNOPSIS
-st-icu:    basic C++ program using the ICU library
+     st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
            http://site.icu-project.org/
-st-swath:  sh script wrapper to simplfy args to the swath program
+DESCRIPTION
-           https://linux.thai.net/projects/swath
+     This package is a collection of utilities to separate Thai words
      by spaces (word tokenization).  They can separate stdin, files,
-All scripts should be able to take a filename, stdin, or arguments as
+     or text as arguments.  It includes 3 separate utilities:
 input, e.g., :
      st-emacs:  emacs-script using emacs lisp thai-word library
                 https://www.gnu.org/software/emacs/
      st-icu:    basic C++ program using the ICU library
                 http://site.icu-project.org/
      st-swath:  sh script wrapper to simplify args to the swath program
                 https://linux.thai.net/projects/swath
 EXAMPLES
       split one or more text strings
       # st-swath แมวและหมา
 or
       # echo "แมวและหมา" | st-swath
 or
       # st-swath < thaifile.txt
 or
       # st-swath "แมวหมา" พ่อและแม่
-You will most likely need to set LC_ALL or LC_CTYPE to an approriate
+      read stdin
-unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for
+      # echo "แมวและหมา" | st-swath
 them to work properly.  These tools are setup to only support UTF-8
 encodings.
 Note that it is not possible to split Thai words 100% accurately
 without context and meaning.  These programs use dictionary-based word
 splitting.
 Also included in the package is a combined thai word dictionary and
 corresponding .tri file, and emacs lisp .el file for reading and
 dumping out dictionary files.
 st-emacs and st-swath are setup to use the combined dictionary with
 words from the emacs 'thai-word library, swath dictionary words, and
 the icu thai library words.
 st-icu uses its own built in library.  To customise the icu
 dictionary, you apparently would have to modify
   icu4c/source/data/brkitr/dictionaries/thaidict.txt
 and rebuild icu library, and then rebuild the whole thing.
 There is also
-See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+      read from a file
       # st-swath < thaifile.txt
       # st-swath somefile.txt
-TODO - fix st-icu to use all the combined dictionary words.
+      They can also read directly from stdin
       # st-icu
         แมวหมา   (typed in)
         แมว หมา  (output line by line)
 ENVIRONMENT
      You will most likely need to set the environment variables LC_ALL
      or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or
      C.UTF-8.  These tools are set up to handle only UTF-8 encodings.
 EXIT STATUS
      0 for success, non zero otherwise
 NOTES
      Note that it is not possible to split Thai words 100% accurately
      without context and meaning.  All these programs use
      dictionary-based word splitting.
      Also included in the package is a combined thai word dictionary
      and corresponding .tri file, and emacs lisp .el files for reading
      and dumping out dictionary files.
      st-emacs and st-swath are set up to use the combined dictionary
      with words from the emacs 'thai-word library, swath dictionary
      words, and the icu thai library words.
      st-icu uses its own built in library.  To customise the icu
      dictionary, you apparently would have to modify
      icu4c/source/data/brkitr/dictionaries/thaidict.txt and then
      rebuild the whole library.
 SEE ALSO
      swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
 BUGS
      st-icu should also use the combined dictionary words.
      st-emacs and st-icu don't always split thai numbers well.
      this file should be converted to a proper manpage.

 @@ -2,27 +2,27 @@
 ;;
 ;; break thai string into words separated by spaces
 ;;
 ;; - if no args, process stdin
 ;; - if one arg and file exists with arg name, process file
 ;; - else join get remainder of args and process
 ;;
 ;;(toggle-debug-on-error) ;; debug
 ;; load custom dictionary.  thai-dict defines 'thai-word-table with a
 ;; defvar, and a defvar is ignored once the variable is bound, so it
 ;; has to be loaded before thai-word supplies its built-in table.
+(load "ST_SHARE_DIR/thai-dict" nil t)
 (require 'thai-word)
 (load "ST_SHARE_DIR/thai-utility" nil t)
-(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict")
 ;; split a thai line by spaces, return new line
 (defun process-thai-line(line)
   "Return a copy of LINE in which `thai-break-words' has put a
 space between the thai words it recognises."
   (with-temp-buffer
     ;; insert without moving point, so the break starts at the
     ;; beginning of the text
     (save-excursion
       (insert line))
     (thai-break-words " ")
     (buffer-substring (point-min) (point-max))))
 ;; hack to process stdin
 (defun process-stdin()
   (condition-case nil
       (let (aline)

 @@ -33,28 +33,28 @@ uses recursion"
        ((equal complete 1)
 	(append (list thaistr)
 		(extract-thai-na (cddr nlist) thaistr) '()))
        (t
 	(error "invalid parsing for complete var"))))
      ;; not finished
      (t
       (append (extract-thai-na (car nlist) thaistr)
 	      (extract-thai-na (cdr nlist) thaistr) '())))))
 (defun thai-word-table-save(filename &optional alist)
   "save thai words extracted from a nested-alist table to
-filename in utf8 format.  default is to save 'thai-word-table if
+filename in utf8 format, one word per line.  default is to save
-no alist argument given."
+'thai-word-table if no alist argument given."
   (interactive)
   (let ((thaiwords)
 	(elem)
 	(coding-system-for-read 'utf-8)
 	(coding-system-for-write 'utf-8)
 	(buffer-file-coding-system 'utf-8))
     ;; default list or not
     (setq alist (or alist
 		    thai-word-table))
     (or (nested-alist-p alist)
       (error "Invalid argument %s" alist))
 @@ -85,13 +85,54 @@ is appended instead to the current word
   (interactive "FThai word table file: \nP")
   (let* ((coding-system-for-read 'utf-8)
 	 (coding-system-for-write 'utf-8)
 	 (buffer-file-coding-system 'utf-8)
 	 (temp_file (make-temp-file "thaiutf8_")))
     (unwind-protect
 	(with-temp-buffer
 	  (insert-file-contents file)
 	  (setq coding-system-for-write 'thai-tis620)
 	  (write-file temp_file))
       (thai-update-word-table temp_file append)
       (delete-file temp_file)
       thai-word-table)))
 (defun thai-word-table-save-defvar(dictfile lispfile)
   "Read the utf8 thai dictionary DICTFILE, one word per line, and
 write LISPFILE, a lisp file suitable for initializing the
 'thai-word-table as a \"defvar\".  Blank lines in DICTFILE are not
 turned into words.  Overwrites LISPFILE if it exists.
 Since a \"defvar\" only takes effect while the variable is still
 unbound, LISPFILE has to be loaded before anything else defines
 'thai-word-table (e.g. the thai-word library).
 When called interactively, prompt for both file names."
   (interactive "fThai dictionary file: \nFLisp output file: ")
   (let ((header (list "(defvar thai-word-table"
                       "(let ((table (list 'thai-words)))"
                       "(dolist (elt"
                       "'(" ))
         (footer (list "))"
                       "(set-nested-alist elt 1 table))"
                       "table)"
                       "\"Nested alist of Thai words.\")" ))
         (coding-system-for-read 'utf-8)
         (coding-system-for-write 'utf-8)
         (buffer-file-coding-system 'utf-8))
     (with-temp-buffer
       (insert-file-contents dictfile)
       (goto-char (point-min))
       ;; quote each thai word; a blank line is left alone so that it
       ;; does not become an empty-string entry in the word list
       (while (not (eobp))
         (beginning-of-line)
         (unless (eolp)
           (insert "\"")
           (end-of-line)
           (insert "\""))
         (forward-line 1))
       ;; wrap the quoted words in the defvar form
       (goto-char (point-min))
       (dolist (line header)
         (insert line "\n"))
       (goto-char (point-max))
       ;; keep the footer on its own line when DICTFILE lacks a final
       ;; newline
       (unless (bolp)
         (insert "\n"))
       (dolist (line footer)
         (insert line "\n"))
       (lisp-mode)
       (indent-region (point-min) (point-max))
       (write-region nil nil lispfile))))

 @@ -1,9 +1,11 @@
-@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $
+@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $
 bin/st-emacs
 bin/st-icu
 bin/st-swath
 share/split-thai/README.txt
 share/split-thai/thai-dict.el
 share/split-thai/thai-dict.elc
 share/split-thai/thai-utility.el
 share/split-thai/thai-utility.elc
 share/split-thai/thaidict
 share/split-thai/thaidict.tri