Received: by mail.netbsd.org (Postfix, from userid 605) id 7433A84D22; Sat, 15 Aug 2020 16:52:30 +0000 (UTC) Received: from localhost (localhost [127.0.0.1]) by mail.netbsd.org (Postfix) with ESMTP id EDC4084CF1 for ; Sat, 15 Aug 2020 16:52:29 +0000 (UTC) X-Virus-Scanned: amavisd-new at netbsd.org Received: from mail.netbsd.org ([127.0.0.1]) by localhost (mail.netbsd.org [127.0.0.1]) (amavisd-new, port 10025) with ESMTP id WUdr7nYgmW0P for ; Sat, 15 Aug 2020 16:52:29 +0000 (UTC) Received: from cvs.NetBSD.org (ivanova.netbsd.org [199.233.217.197]) by mail.netbsd.org (Postfix) with ESMTP id 4195F84CEE for ; Sat, 15 Aug 2020 16:52:29 +0000 (UTC) Received: by cvs.NetBSD.org (Postfix, from userid 500) id 3AAF8FB28; Sat, 15 Aug 2020 16:52:29 +0000 (UTC) Content-Transfer-Encoding: 7bit Content-Type: multipart/mixed; boundary="_----------=_1597510349104410" MIME-Version: 1.0 Date: Sat, 15 Aug 2020 16:52:29 +0000 From: "Sean Cole" Subject: CVS commit: pkgsrc/textproc/split-thai To: pkgsrc-changes@NetBSD.org Reply-To: scole@netbsd.org X-Mailer: log_accum Message-Id: <20200815165229.3AAF8FB28@cvs.NetBSD.org> Sender: pkgsrc-changes-owner@NetBSD.org List-Id: pkgsrc-changes.NetBSD.org Precedence: bulk List-Unsubscribe: This is a multi-part message in MIME format. 
--_----------=_1597510349104410 Content-Disposition: inline Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset="US-ASCII" Module Name: pkgsrc Committed By: scole Date: Sat Aug 15 16:52:29 UTC 2020 Modified Files: pkgsrc/textproc/split-thai: Makefile pkgsrc/textproc/split-thai/files: st-emacs thai-utility.el Log Message: Update to version 0.3. All changes are for the emacs splitter: - load custom dictionary first because 'thai-word-table is a defvar - add count function and return word counts for a few funcs - add lisp wrapper functions split-thai and split-thai-line, which can split thai text in an emacs buffer using 'thai-break-words To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 pkgsrc/textproc/split-thai/Makefile cvs rdiff -u -r1.2 -r1.3 pkgsrc/textproc/split-thai/files/st-emacs \ pkgsrc/textproc/split-thai/files/thai-utility.el Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. --_----------=_1597510349104410 Content-Disposition: inline Content-Length: 5346 Content-Transfer-Encoding: binary Content-Type: text/x-diff; charset=us-ascii Modified files: Index: pkgsrc/textproc/split-thai/Makefile diff -u pkgsrc/textproc/split-thai/Makefile:1.2 pkgsrc/textproc/split-thai/Makefile:1.3 --- pkgsrc/textproc/split-thai/Makefile:1.2 Fri Aug 14 17:31:34 2020 +++ pkgsrc/textproc/split-thai/Makefile Sat Aug 15 16:52:28 2020 @@ -1,6 +1,6 @@ -# $NetBSD: Makefile,v 1.2 2020/08/14 17:31:34 scole Exp $ +# $NetBSD: Makefile,v 1.3 2020/08/15 16:52:28 scole Exp $ -PKGNAME= split-thai-0.2 +PKGNAME= split-thai-0.3 CATEGORIES= textproc MAINTAINER= pkgsrc-users@NetBSD.org COMMENT= Utilities to split UTF-8 Thai text into words Index: pkgsrc/textproc/split-thai/files/st-emacs diff -u pkgsrc/textproc/split-thai/files/st-emacs:1.2 pkgsrc/textproc/split-thai/files/st-emacs:1.3 --- pkgsrc/textproc/split-thai/files/st-emacs:1.2 Fri Aug 14 17:31:34 2020 +++ pkgsrc/textproc/split-thai/files/st-emacs Sat Aug 15 16:52:29 
2020 @@ -8,11 +8,10 @@ ;; ;;(toggle-debug-on-error) ;; debug -(require 'thai-word) -;; load custom dictionary -(load "ST_SHARE_DIR/thai-utility" nil t) +;; load custom dictionary first, 'thai-word-table is defvar (load "ST_SHARE_DIR/thai-dict" nil t) +(load "ST_SHARE_DIR/thai-utility" nil t) ;; split a thai line by spaces, return new line (defun process-thai-line(line) Index: pkgsrc/textproc/split-thai/files/thai-utility.el diff -u pkgsrc/textproc/split-thai/files/thai-utility.el:1.2 pkgsrc/textproc/split-thai/files/thai-utility.el:1.3 --- pkgsrc/textproc/split-thai/files/thai-utility.el:1.2 Fri Aug 14 17:31:34 2020 +++ pkgsrc/textproc/split-thai/files/thai-utility.el Sat Aug 15 16:52:29 2020 @@ -44,10 +44,12 @@ uses recursion" (defun thai-word-table-save(filename &optional alist) "save thai words extracted from a nested-alist table to filename in utf8 format, one word per line. default is to save -'thai-word-table if no alist argument given." - (interactive) +'thai-word-table if no alist argument given. Returns number of +dictionary words." + (interactive "FName of file to save to: \nP") (let ((thaiwords) (elem) + (line_count) (coding-system-for-read 'utf-8) (coding-system-for-write 'utf-8) (buffer-file-coding-system 'utf-8)) @@ -72,8 +74,29 @@ filename in utf8 format, one word per li (insert elem "\n"))) (sort-lines nil (point-min) (point-max)) + (setq line_count (count-lines (point-min) (point-max))) (write-region nil nil filename) - (buffer-string)))) + line_count))) + +(defun count-words-nested-alist (&optional alist) + "Count number of words in a nested alist. 
if no arg given, +count 'thai-word-table words" + (interactive) + (let ((count 0) + (elem) + (thaiwords)) + ;; default list or not + (setq alist (or alist thai-word-table)) + (or (nested-alist-p alist) + (error "Invalid argument %s" alist)) + ;; remove 'thai-words from 'thai-word-table + (setq alist (cdr alist)) + (while (setq elem (car alist)) + (setq alist (cdr alist)) + (setq thaiwords (extract-thai-na elem "")) + (setq count (+ count (length thaiwords)))) + (message "%d words in nested alist" count) + count)) ;; 'thai-tis620 is default for emacs <= 28 (defun thai-update-word-table-utf8 (file &optional append) @@ -99,25 +122,32 @@ is appended instead to the current word (defun thai-word-table-save-defvar(dictfile lispfile) "read a utf8 thai dictionary file and save to a lisp file suitable for initializing the 'thai-word-table as a \"defvar\". -Overwrites the lisp file if it exists." +Overwrites the lisp file if it exists. Returns count of +dictionary words." (interactive) (let ((header) (footer) (elem) + (line_count) (coding-system-for-read 'utf-8) (coding-system-for-write 'utf-8) (buffer-file-coding-system 'utf-8)) - (setq header (list "(defvar thai-word-table" - "(let ((table (list 'thai-words)))" - "(dolist (elt" - "'(" )) - (setq footer (list "))" - "(set-nested-alist elt 1 table))" - "table)" - "\"Nested alist of Thai words.\")" )) + (setq header (list + ";; file auto-generated from thai-word-table-save-defvar" + "" + "(defvar thai-word-table" + "(let ((table (list 'thai-words)))" + "(dolist (elt" + "'(" )) + (setq footer (list + "))" + "(set-nested-alist elt 1 table))" + "table)" + "\"Nested alist of Thai words.\")" )) (with-temp-buffer (insert-file-contents dictfile) (goto-char (point-min)) + (setq line_count (count-lines (point-min) (point-max))) ;; quote each thai word (while (not (eobp)) (beginning-of-line) @@ -135,4 +165,18 @@ Overwrites the lisp file if it exists." 
(insert elem "\n")) (lisp-mode) (indent-region (point-min) (point-max)) - (write-region nil nil lispfile)))) + (write-region nil nil lispfile)) + line_count)) + +(defun split-thai-line(&optional separator) + "Break Thai words from point to end of line by inserting a +separator string at word boundaries. (wrapper for 'thai-break-words)" + (interactive) + (thai-break-words (or separator " ") (line-end-position))) + +(defun split-thai(&optional separator) + "Break Thai words from point to end of buffer by inserting a +separator string at word boundaries. (wrapper for +'thai-break-words)" + (interactive) + (thai-break-words (or separator " ") (point-max))) --_----------=_1597510349104410--