Fri Aug 14 17:31:35 2020 UTC ()
Update to version 0.2

- generate the emacs dictionary once at build time, not every time the
  program is run
- clean up the README


(scole)
diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/Makefile
diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/PLIST
diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/README.txt
diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/st-emacs
diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/thai-utility.el

cvs diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/Makefile (expand / switch to unified diff)

--- pkgsrc/textproc/split-thai/Makefile 2020/08/13 20:52:09 1.1
+++ pkgsrc/textproc/split-thai/Makefile 2020/08/14 17:31:34 1.2
@@ -1,81 +1,88 @@ @@ -1,81 +1,88 @@
1# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $ 1# $NetBSD: Makefile,v 1.2 2020/08/14 17:31:34 scole Exp $
2 2
3PKGNAME= split-thai-0.1 3PKGNAME= split-thai-0.2
4CATEGORIES= textproc 4CATEGORIES= textproc
5MAINTAINER= pkgsrc-users@NetBSD.org 5MAINTAINER= pkgsrc-users@NetBSD.org
6COMMENT= Utilities to split UTF-8 Thai text into words 6COMMENT= Utilities to split UTF-8 Thai text into words
7LICENSE= public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict 7LICENSE= public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
8 8
9# xxx fetching a specific version of a file out of a github project 9# xxx fetching a specific version of a file out of a github project
10EXTRACT_SUFX= # none 10EXTRACT_SUFX= # none
11GITHUB_ICU_TAG= 61607c27732906d36c5bd4d23ecc092f89f53a2b 11GITHUB_ICU_TAG= 61607c27732906d36c5bd4d23ecc092f89f53a2b
12DISTFILES= thaidict-${GITHUB_ICU_TAG}.txt 12DISTFILES= thaidict-${GITHUB_ICU_TAG}.txt
13MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt 13MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt
14 14
15USE_LANGUAGES= c++11 # darwin needed 11? 15USE_LANGUAGES= c++11 # darwin needed 11?
16 16
17USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo 17USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
18BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie 18BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie
19DEPENDS+= emacs-[0-9]*:../../editors/emacs 19DEPENDS+= emacs-[0-9]*:../../editors/emacs
20DEPENDS+= swath-[0-9]*:../../textproc/swath 20DEPENDS+= swath-[0-9]*:../../textproc/swath
21 21
22REPLACE_SH= st-swath 22REPLACE_SH= st-swath
23 23
24UTF8_ENV= env LC_ALL=C.UTF-8 24UTF8_ENV= env LC_ALL=C.UTF-8
25 25
26ST_SHARE_DIR= share/split-thai 26ST_SHARE_DIR= share/split-thai
27INSTALLATION_DIRS= bin ${ST_SHARE_DIR} 27INSTALLATION_DIRS= bin ${ST_SHARE_DIR}
28 28
 29ST_SHARE_FILES= README.txt thaidict thai-dict.el thai-dict.elc
 30ST_SHARE_FILES+= thai-utility.el thai-utility.elc thaidict.tri
 31
29# xxx REPLACE_EMACS_SCRIPT 32# xxx REPLACE_EMACS_SCRIPT
30SUBST_CLASSES+= st-emacs-app 33SUBST_CLASSES+= st-emacs-app
31SUBST_STAGE.st-emacs-app= pre-configure 34SUBST_STAGE.st-emacs-app= pre-configure
32SUBST_MESSAGE.st-emacs-app= Fixing emacs script paths. 35SUBST_MESSAGE.st-emacs-app= Fixing emacs script paths.
33SUBST_FILES.st-emacs-app= st-emacs 36SUBST_FILES.st-emacs-app= st-emacs
34SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g' 37SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
35 38
36SUBST_CLASSES+= dictionary-app 39SUBST_CLASSES+= dictionary-app
37SUBST_STAGE.dictionary-app= pre-configure 40SUBST_STAGE.dictionary-app= pre-configure
38SUBST_MESSAGE.dictionary-app= Fixing dictionary paths. 41SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
39SUBST_FILES.dictionary-app= st-emacs st-swath 42SUBST_FILES.dictionary-app= st-emacs st-swath
40SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g' 43SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
41 44
42pre-extract: 45pre-extract:
43 mkdir -p ${WRKSRC} 46 mkdir -p ${WRKSRC}
44 cd files && cp README.txt st-emacs st-icu.cc st-swath \ 47 cd files && cp README.txt st-emacs st-icu.cc st-swath \
45 thai-utility.el thaidict.abm ${WRKSRC} 48 thai-utility.el thaidict.abm ${WRKSRC}
46 49
47post-extract: 50post-extract:
48 cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \ 51 cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
49 -f batch-byte-compile thai-utility.el 52 -f batch-byte-compile thai-utility.el
50 cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \ 53 cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
51 --eval '(thai-word-table-save "emacs-dict")' 54 --eval '(thai-word-table-save "emacs-dict")'
52 cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict 55 cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
53 cd ${PREFIX}/share/swath && \ 56 cd ${PREFIX}/share/swath && \
54 ${UTF8_ENV} trietool swathdic list | \ 57 ${UTF8_ENV} trietool swathdic list | \
55 awk '{print $$1}' > ${WRKSRC}/swath-dict 58 awk '{print $$1}' > ${WRKSRC}/swath-dict
56 cd ${WRKSRC} && \ 59 cd ${WRKSRC} && \
57 ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \ 60 ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
58 grep -v '#' | sort | uniq > thaidict 61 grep -v '#' | sort | uniq > thaidict
59 cd ${WRKSRC} && \ 62 cd ${WRKSRC} && \
60 ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict 63 ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
 64 cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
 65 --eval '(thai-word-table-save-defvar "thaidict" "thai-dict.el")'
 66 cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
 67 -f batch-byte-compile thai-dict.el
61.for i in emacs-dict icu-dict swath-dict 68.for i in emacs-dict icu-dict swath-dict
62 @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i} 69 @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
63.endfor 70.endfor
64 @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \ 71 @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
65 unique words in combined dictionary 72 unique words in combined dictionary
66 73
67do-build: 74do-build:
68 cd ${WRKSRC} && \ 75 cd ${WRKSRC} && \
69 ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \ 76 ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \
70 `pkg-config --libs --cflags icu-io` 77 `pkg-config --libs --cflags icu-io`
71 78
72do-install: 79do-install:
73 ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \ 80 ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
74 ${DESTDIR}${PREFIX}/bin 81 ${DESTDIR}${PREFIX}/bin
75 ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin 82 ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
76.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri 83.for i in ${ST_SHARE_FILES}
77 ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai 84 ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
78.endfor 85.endfor
79 86
80.include "../../textproc/icu/buildlink3.mk" 87.include "../../textproc/icu/buildlink3.mk"
81.include "../../mk/bsd.pkg.mk" 88.include "../../mk/bsd.pkg.mk"

cvs diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/PLIST (expand / switch to unified diff)

--- pkgsrc/textproc/split-thai/PLIST 2020/08/13 20:52:09 1.1
+++ pkgsrc/textproc/split-thai/PLIST 2020/08/14 17:31:34 1.2
@@ -1,9 +1,11 @@ @@ -1,9 +1,11 @@
1@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $ 1@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $
2bin/st-emacs 2bin/st-emacs
3bin/st-icu 3bin/st-icu
4bin/st-swath 4bin/st-swath
5share/split-thai/README.txt 5share/split-thai/README.txt
 6share/split-thai/thai-dict.el
 7share/split-thai/thai-dict.elc
6share/split-thai/thai-utility.el 8share/split-thai/thai-utility.el
7share/split-thai/thai-utility.elc 9share/split-thai/thai-utility.elc
8share/split-thai/thaidict 10share/split-thai/thaidict
9share/split-thai/thaidict.tri 11share/split-thai/thaidict.tri

cvs diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/Attic/README.txt (expand / switch to unified diff)

--- pkgsrc/textproc/split-thai/files/Attic/README.txt 2020/08/13 20:52:09 1.1
+++ pkgsrc/textproc/split-thai/files/Attic/README.txt 2020/08/14 17:31:34 1.2
@@ -1,49 +1,70 @@ @@ -1,49 +1,70 @@
1This is a collection of utilities to separate Thai words by spaces 1NAME
2(word tokenization). They can separate stdin, files, or text as 2 st-emacs
3arguments. It includes 3 separate utilities: 3 st-icu
4 4 st-swath
5st-emacs: emacs-script using emacs lisp thai-word library 5
6 https://www.gnu.org/software/emacs/ 6SYNOPSIS
7st-icu: basic C++ program using the ICU library 7 st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
8 http://site.icu-project.org/ 8
9st-swath: sh script wrapper to simplify args to the swath program 9DESCRIPTION
10 https://linux.thai.net/projects/swath 10 This package is a collection of utilities to separate Thai words
11 11 by spaces (word tokenization). They can separate stdin, files,
12All scripts should be able to take a filename, stdin, or arguments as 12 or text as arguments. It includes 3 separate utilities:
13input, e.g., : 13
 14 st-emacs: emacs-script using emacs lisp thai-word library
 15 https://www.gnu.org/software/emacs/
 16 st-icu: basic C++ program using the ICU library
 17 http://site.icu-project.org/
 18 st-swath: sh script wrapper to simplify args to the swath program
 19 https://linux.thai.net/projects/swath
14 20
 21EXAMPLES
 22 split one or more text strings
15 # st-swath แมวและหมา 23 # st-swath แมวและหมา
16or 
17 # echo "แมวและหมา" | st-swath 
18or  
19 # st-swath < thaifile.txt 
20or 
21 # st-swath "แมวหมา" พ่อและแม่ 24 # st-swath "แมวหมา" พ่อและแม่
22  25
23You will most likely need to set LC_ALL or LC_CTYPE to an appropriate 26 read stdin
24unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for 27 # echo "แมวและหมา" | st-swath
25them to work properly. These tools are setup to only support UTF-8 
26encodings. 
27 
28Note that it is not possible to split Thai words 100% accurately 
29without context and meaning. These programs use dictionary-based word 
30splitting. 
31 
32Also included in the package is a combined thai word dictionary and 
33corresponding .tri file, and emacs lisp .el file for reading and 
34dumping out dictionary files. 
35 
36st-emacs and st-swath are setup to use the combined dictionary with 
37words from the emacs 'thai-word library, swath dictionary words, and 
38the icu thai library words. 
39 
40st-icu uses its own built in library. To customise the icu 
41dictionary, you apparently would have to modify 
42 icu4c/source/data/brkitr/dictionaries/thaidict.txt 
43and rebuild icu library, and then rebuild the whole thing. 
44 
45There is also  
46 28
47See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1) 29 read from a file
 30 # st-swath < thaifile.txt
 31 # st-swath somefile.txt
48 32
49TODO - fix st-icu to use all the combined dictionary words. 33 They can also read directly from stdin
 34 # st-icu
 35 แมวหมา (typed in)
 36 แมว หมา (output line by line)
 37
 38ENVIRONMENT
 39 You will most likely need to set the environment variables LC_ALL
 40 or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or
 41 C.UTF-8. These tools are set up to handle only UTF-8 encodings.
 42
 43EXIT STATUS
 44 0 for success, non zero otherwise
 45
 46NOTES
 47 Note that it is not possible to split Thai words 100% accurately
 48 without context and meaning. All these programs use
 49 dictionary-based word splitting.
 50
 51 Also included in the package is a combined thai word dictionary
 52 and corresponding .tri file, and emacs lisp .el files for reading
 53 and dumping out dictionary files.
 54
 55 st-emacs and st-swath are set up to use the combined dictionary
 56 with words from the emacs 'thai-word library, swath dictionary
 57 words, and the icu thai library words.
 58
 59 st-icu uses its own built in library. To customise the icu
 60 dictionary, you apparently would have to modify
 61 icu4c/source/data/brkitr/dictionaries/thaidict.txt and then
 62 rebuild the whole library.
 63
 64SEE ALSO
 65 swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
 66
 67BUGS
 68 st-icu should also use the combined dictionary words.
 69 st-emacs and st-icu don't always split thai numbers well.
 70 this file should be converted to a proper manpage.

cvs diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/Attic/st-emacs (expand / switch to unified diff)

--- pkgsrc/textproc/split-thai/files/Attic/st-emacs 2020/08/13 20:52:09 1.1
+++ pkgsrc/textproc/split-thai/files/Attic/st-emacs 2020/08/14 17:31:34 1.2
@@ -2,27 +2,27 @@ @@ -2,27 +2,27 @@
2;; 2;;
3;; break thai string into words separated by spaces 3;; break thai string into words separated by spaces
4;; 4;;
5;; - if no args, process stdin 5;; - if no args, process stdin
6;; - if one arg and file exists with arg name, process file 6;; - if one arg and file exists with arg name, process file
7;; - else join the remainder of args and process 7;; - else join the remainder of args and process
8;; 8;;
9 9
10;;(toggle-debug-on-error) ;; debug 10;;(toggle-debug-on-error) ;; debug
11(require 'thai-word) 11(require 'thai-word)
12 12
13;; load custom dictionary 13;; load custom dictionary
14(load "ST_SHARE_DIR/thai-utility" nil t) 14(load "ST_SHARE_DIR/thai-utility" nil t)
15(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict") 15(load "ST_SHARE_DIR/thai-dict" nil t)
16 16
17;; split a thai line by spaces, return new line 17;; split a thai line by spaces, return new line
18(defun process-thai-line(line) 18(defun process-thai-line(line)
19 (with-temp-buffer 19 (with-temp-buffer
20 (insert line) 20 (insert line)
21 (goto-char (point-min)) 21 (goto-char (point-min))
22 (thai-break-words " ") 22 (thai-break-words " ")
23 (buffer-string))) 23 (buffer-string)))
24 24
25;; hack to process stdin 25;; hack to process stdin
26(defun process-stdin() 26(defun process-stdin()
27 (condition-case nil 27 (condition-case nil
28 (let (aline) 28 (let (aline)

cvs diff -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/Attic/thai-utility.el (expand / switch to unified diff)

--- pkgsrc/textproc/split-thai/files/Attic/thai-utility.el 2020/08/13 20:52:09 1.1
+++ pkgsrc/textproc/split-thai/files/Attic/thai-utility.el 2020/08/14 17:31:34 1.2
@@ -33,28 +33,28 @@ uses recursion" @@ -33,28 +33,28 @@ uses recursion"
33 ((equal complete 1) 33 ((equal complete 1)
34 (append (list thaistr) 34 (append (list thaistr)
35 (extract-thai-na (cddr nlist) thaistr) '())) 35 (extract-thai-na (cddr nlist) thaistr) '()))
36 (t 36 (t
37 (error "invalid parsing for complete var")))) 37 (error "invalid parsing for complete var"))))
38  38
39 ;; not finished 39 ;; not finished
40 (t 40 (t
41 (append (extract-thai-na (car nlist) thaistr) 41 (append (extract-thai-na (car nlist) thaistr)
42 (extract-thai-na (cdr nlist) thaistr) '()))))) 42 (extract-thai-na (cdr nlist) thaistr) '())))))
43 43
44(defun thai-word-table-save(filename &optional alist) 44(defun thai-word-table-save(filename &optional alist)
45 "save thai words extracted from a nested-alist table to 45 "save thai words extracted from a nested-alist table to
46filename in utf8 format. default is to save 'thai-word-table if 46filename in utf8 format, one word per line. default is to save
47no alist argument given." 47'thai-word-table if no alist argument given."
48 (interactive) 48 (interactive)
49 (let ((thaiwords) 49 (let ((thaiwords)
50 (elem) 50 (elem)
51 (coding-system-for-read 'utf-8) 51 (coding-system-for-read 'utf-8)
52 (coding-system-for-write 'utf-8) 52 (coding-system-for-write 'utf-8)
53 (buffer-file-coding-system 'utf-8)) 53 (buffer-file-coding-system 'utf-8))
54 ;; default list or not 54 ;; default list or not
55 (setq alist (or alist 55 (setq alist (or alist
56 thai-word-table)) 56 thai-word-table))
57 57
58 (or (nested-alist-p alist) 58 (or (nested-alist-p alist)
59 (error "Invalid argument %s" alist)) 59 (error "Invalid argument %s" alist))
60 60
@@ -85,13 +85,54 @@ is appended instead to the current word  @@ -85,13 +85,54 @@ is appended instead to the current word
85 (interactive "FThai word table file: \nP") 85 (interactive "FThai word table file: \nP")
86 (let* ((coding-system-for-read 'utf-8) 86 (let* ((coding-system-for-read 'utf-8)
87 (coding-system-for-write 'utf-8) 87 (coding-system-for-write 'utf-8)
88 (buffer-file-coding-system 'utf-8) 88 (buffer-file-coding-system 'utf-8)
89 (temp_file (make-temp-file "thaiutf8_"))) 89 (temp_file (make-temp-file "thaiutf8_")))
90 (unwind-protect 90 (unwind-protect
91 (with-temp-buffer 91 (with-temp-buffer
92 (insert-file-contents file) 92 (insert-file-contents file)
93 (setq coding-system-for-write 'thai-tis620) 93 (setq coding-system-for-write 'thai-tis620)
94 (write-file temp_file)) 94 (write-file temp_file))
95 (thai-update-word-table temp_file append) 95 (thai-update-word-table temp_file append)
96 (delete-file temp_file) 96 (delete-file temp_file)
97 thai-word-table))) 97 thai-word-table)))
 98
 99(defun thai-word-table-save-defvar(dictfile lispfile)
 100 "read a utf8 thai dictionary file and save to a lisp file
 101suitable for initializing the 'thai-word-table as a \"defvar\".
 102Overwrites the lisp file if it exists."
 103 (interactive)
 104 (let ((header)
 105 (footer)
 106 (elem)
 107 (coding-system-for-read 'utf-8)
 108 (coding-system-for-write 'utf-8)
 109 (buffer-file-coding-system 'utf-8))
 110 (setq header (list "(defvar thai-word-table"
 111 "(let ((table (list 'thai-words)))"
 112 "(dolist (elt"
 113 "'(" ))
 114 (setq footer (list "))"
 115 "(set-nested-alist elt 1 table))"
 116 "table)"
 117 "\"Nested alist of Thai words.\")" ))
 118 (with-temp-buffer
 119 (insert-file-contents dictfile)
 120 (goto-char (point-min))
 121 ;; quote each thai word
 122 (while (not (eobp))
 123 (beginning-of-line)
 124 (insert "\"")
 125 (end-of-line)
 126 (insert "\"")
 127 (forward-line 1))
 128
 129 (goto-char (point-min))
 130 (dolist (elem header)
 131 (insert elem "\n"))
 132
 133 (goto-char (point-max))
 134 (dolist (elem footer)
 135 (insert elem "\n"))
 136 (lisp-mode)
 137 (indent-region (point-min) (point-max))
 138 (write-region nil nil lispfile))))