Received: by mail.netbsd.org (Postfix, from userid 605) id 903C684D60; Fri, 28 Aug 2020 16:02:44 +0000 (UTC) Received: from localhost (localhost [127.0.0.1]) by mail.netbsd.org (Postfix) with ESMTP id 183ED84D22 for ; Fri, 28 Aug 2020 16:02:44 +0000 (UTC) X-Virus-Scanned: amavisd-new at netbsd.org Received: from mail.netbsd.org ([IPv6:::1]) by localhost (mail.netbsd.org [IPv6:::1]) (amavisd-new, port 10025) with ESMTP id X8scfLMn9z5x for ; Fri, 28 Aug 2020 16:02:43 +0000 (UTC) Received: from cvs.NetBSD.org (ivanova.netbsd.org [199.233.217.197]) by mail.netbsd.org (Postfix) with ESMTP id 1AA2C84CF1 for ; Fri, 28 Aug 2020 16:02:43 +0000 (UTC) Received: by cvs.NetBSD.org (Postfix, from userid 500) id 13958FB27; Fri, 28 Aug 2020 16:02:43 +0000 (UTC) Content-Transfer-Encoding: 7bit Content-Type: multipart/mixed; boundary="_----------=_159863056340790" MIME-Version: 1.0 Date: Fri, 28 Aug 2020 16:02:43 +0000 From: "Sean Cole" Subject: CVS commit: pkgsrc/textproc/split-thai To: pkgsrc-changes@NetBSD.org Reply-To: scole@netbsd.org X-Mailer: log_accum Message-Id: <20200828160243.13958FB27@cvs.NetBSD.org> Sender: pkgsrc-changes-owner@NetBSD.org List-Id: pkgsrc-changes.NetBSD.org Precedence: bulk List-Unsubscribe: This is a multi-part message in MIME format. 
--_----------=_159863056340790 Content-Disposition: inline Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset="US-ASCII" Module Name: pkgsrc Committed By: scole Date: Fri Aug 28 16:02:42 UTC 2020 Modified Files: pkgsrc/textproc/split-thai: DESCR Makefile PLIST pkgsrc/textproc/split-thai/files: README.txt Added Files: pkgsrc/textproc/split-thai/files: tgrep Log Message: Update to 0.8 - add 'tgrep' perl script for grepping thai words To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 pkgsrc/textproc/split-thai/DESCR cvs rdiff -u -r1.7 -r1.8 pkgsrc/textproc/split-thai/Makefile cvs rdiff -u -r1.2 -r1.3 pkgsrc/textproc/split-thai/PLIST cvs rdiff -u -r1.3 -r1.4 pkgsrc/textproc/split-thai/files/README.txt cvs rdiff -u -r0 -r1.1 pkgsrc/textproc/split-thai/files/tgrep Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. --_----------=_159863056340790 Content-Disposition: inline Content-Length: 11836 Content-Transfer-Encoding: binary Content-Type: text/x-diff; charset=utf-8 Modified files: Index: pkgsrc/textproc/split-thai/DESCR diff -u pkgsrc/textproc/split-thai/DESCR:1.1 pkgsrc/textproc/split-thai/DESCR:1.2 --- pkgsrc/textproc/split-thai/DESCR:1.1 Thu Aug 13 20:52:09 2020 +++ pkgsrc/textproc/split-thai/DESCR Fri Aug 28 16:02:42 2020 @@ -3,4 +3,5 @@ boundaries, also known as word tokenizat swath, and a c++ icu-project program. All use dictionary-based word splitting. -Also included is merged dictionary file of thai words. +Also included is a merged dictionary file of Thai words and a perl +script to grep Thai UTF-8 words. 
Index: pkgsrc/textproc/split-thai/Makefile diff -u pkgsrc/textproc/split-thai/Makefile:1.7 pkgsrc/textproc/split-thai/Makefile:1.8 --- pkgsrc/textproc/split-thai/Makefile:1.7 Thu Aug 20 14:20:27 2020 +++ pkgsrc/textproc/split-thai/Makefile Fri Aug 28 16:02:42 2020 @@ -1,6 +1,6 @@ -# $NetBSD: Makefile,v 1.7 2020/08/20 14:20:27 scole Exp $ +# $NetBSD: Makefile,v 1.8 2020/08/28 16:02:42 scole Exp $ -PKGNAME= split-thai-0.7 +PKGNAME= split-thai-0.8 CATEGORIES= textproc MAINTAINER= pkgsrc-users@NetBSD.org COMMENT= Utilities to split UTF-8 Thai text into words @@ -15,10 +15,12 @@ MASTER_SITES= -${MASTER_SITE_GITHUB:=uni USE_LANGUAGES= c++11 # darwin needed 11? USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo +USE_TOOLS+= perl:run BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie DEPENDS+= emacs-[0-9]*:../../editors/emacs DEPENDS+= swath-[0-9]*:../../textproc/swath +REPLACE_PERL= tgrep REPLACE_SH= st-swath UTF8_ENV= env LC_ALL=C.UTF-8 @@ -47,7 +49,7 @@ SUBST_SED.dictionary-app+= -e 's,ST_SHAR pre-extract: mkdir -p ${WRKSRC} cd files && cp README.txt st-emacs st-icu.cc st-swath \ - thai-utility.el thaidict.abm ${WRKSRC} + tgrep thai-utility.el thaidict.abm ${WRKSRC} post-extract: cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \ @@ -80,7 +82,7 @@ do-build: do-install: ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \ - ${DESTDIR}${PREFIX}/bin + ${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin .for i in ${ST_SHARE_FILES} ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai Index: pkgsrc/textproc/split-thai/PLIST diff -u pkgsrc/textproc/split-thai/PLIST:1.2 pkgsrc/textproc/split-thai/PLIST:1.3 --- pkgsrc/textproc/split-thai/PLIST:1.2 Fri Aug 14 17:31:34 2020 +++ pkgsrc/textproc/split-thai/PLIST Fri Aug 28 16:02:42 2020 @@ -1,7 +1,8 @@ -@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $ +@comment $NetBSD: PLIST,v 1.3 2020/08/28 16:02:42 scole Exp $ bin/st-emacs 
bin/st-icu bin/st-swath +bin/tgrep share/split-thai/README.txt share/split-thai/thai-dict.el share/split-thai/thai-dict.elc Index: pkgsrc/textproc/split-thai/files/README.txt diff -u pkgsrc/textproc/split-thai/files/README.txt:1.3 pkgsrc/textproc/split-thai/files/README.txt:1.4 --- pkgsrc/textproc/split-thai/files/README.txt:1.3 Mon Aug 17 17:43:15 2020 +++ pkgsrc/textproc/split-thai/files/README.txt Fri Aug 28 16:02:42 2020 @@ -2,14 +2,16 @@ NAME st-emacs st-icu st-swath + tgrep SYNOPSIS st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank'] + tgrep [options] PATTERN [FILE ...] DESCRIPTION This package is a collection of utilities to separate Thai words by spaces (word tokenization). They can separate stdin, files, - or text as arguments. It includes 3 separate utilities: + or text as arguments. It includes these utilities: st-emacs: emacs-script using emacs lisp thai-word library https://www.gnu.org/software/emacs/ @@ -18,30 +20,38 @@ DESCRIPTION st-swath: sh script wrapper to simplfy args to the swath program https://linux.thai.net/projects/swath + tgrep: grep-like utility using perl, see "tgrep -h" + EXAMPLES - split one or more text strings + split one or more text strings: # st-swath แมวและหมา # st-swath "แมวหมา" พ่อและแม่ - read stdin + read stdin: # echo "แมวและหมา" | st-swath - read from a file + read from a file: # st-swath < thaifile.txt # st-swath somefile.txt - They can also read directly from stdin + They can also read directly from stdin: # st-icu แมวหมา (typed in) แมว หมา (output line by line) + grep for thai words: + # tgrep แมว thaifile.txt + ENVIRONMENT You will most likely need to set the environment variables LC_ALL or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or C.UTF-8. These tools are only setup to handle UTF-8 encodings. + A terminal capable of entering and displaying UTF-8, and some + actual UTF-8 fonts installed on the system will also be needed. 
+ EXIT STATUS - 0 for success, non zero otherwise + 0 for success, non zero otherwise. For tgrep, see "tgrep -h" NOTES Note that it is not possible to split Thai words 100% accurately @@ -66,5 +76,6 @@ SEE ALSO BUGS st-icu should also use the combined dictionary words. - thai text mixed with other languages may not be handled well. - this file should be converted to a proper manpage. + thai text mixed with other languages may not be handled well when + splitting. + this file should be converted to proper manpages. Added files: Index: pkgsrc/textproc/split-thai/files/tgrep diff -u /dev/null pkgsrc/textproc/split-thai/files/tgrep:1.1 --- /dev/null Fri Aug 28 16:02:43 2020 +++ pkgsrc/textproc/split-thai/files/tgrep Fri Aug 28 16:02:42 2020 @@ -0,0 +1,208 @@ +#!/bin/perl +# +# perl grep equivalent-wrapper supporting utf-8 and thai in particular +# +use warnings; +use strict; +use Encode; +use Getopt::Std; + +use utf8; +use open qw/:std :utf8/; + +our ( $opt_h, $opt_i, $opt_l, $opt_n, $opt_q, $opt_v ); + +getopts('hilnqv'); + +if ( $opt_h ) { + usage(); + exit 0; +} elsif ( ! defined $ARGV[0] ) { + # no pattern given + usage(); + exit 1; +} + +my $pattern = decode('UTF-8', $ARGV[0]) if defined $ARGV[0]; +unless ( length( $pattern ) ) { + usage(); + exit 1; +} + +my $opt_filesonly = ( defined $opt_l ? 1 : 0 ); +my $opt_ignorecase = ( defined $opt_i ? 1 : 0 ); +my $opt_linenum = ( defined $opt_n ? 1 : 0 ); +my $opt_quiet = ( defined $opt_q ? 1 : 0 ); +my $opt_invert = ( defined $opt_v ? 1 : 0 ); + +# rest of args should be filenames +my @files = @ARGV; +shift @files; +@files = map { decode('UTF-8', $_ ) } @files; + +# +# usage +# +sub usage { + print <<'EOF'; + +NAME + tgrep - print lines matching a pattern, supports utf-8 characters + and some thai character classes using perl regexp matching. + +SYNOPSIS + tgrep [options] PATTERN [FILE] [FILE2] + +DESCRIPTION + tgrep (thai grep) is similar to grep, in that it searches files or + stdin for lines matching a pattern. 
It uses perl to support utf-8 + characters, and therefore the patterns are perl regexp patterns. + It supports a few simple homegrown character classes: + + [:thai:] match any thai unicode value + [:thaiconsonant:] match thai consonant including ฤ ฦ + [:thaidigit:] match thai number ๐๑๒๓๔๕๖๗๘๙ + [:thaitonemark:] match thai tonemark ่้๊๋ + [:thaivowel:] match thai vowel symbols ะัา ำิีึืุูเแโใไๅ็ + does not include consonants that function as vowels + [:thaivowelfull:] same as [:thaivowel:] plus อรวยฤฦๅ used to form + vowel diacritics and diphthongs + [:thaimisc:] match misc thai symbols ฯๆฺ฿์ํ๎๏๚๛ + +OPTIONS + -h print help or usage + + -i ignore case + + -l suppress normal output, only print filenames that match + + -n prefix each line of output with the line number of the file + + -q quiet mode, don't print out matches + + -v invert match or print lines not matching pattern + +ENVIRONMENT + You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8 + setting for this program to work, e.g. for csh-type shells: + setenv LC_CTYPE en_US.UTF-8 + +EXIT STATUS + Similar to grep, returns 0 when matching line found, 1 otherwise. + If an error occurs, exit with 2 unless -q (quiet) option and a + match is found + +EXAMPLES + search for 'ก' in a utf-8 text file + $ tgrep ก file.txt + + use perl regexp to match any line with thai utf-8 characters + $ tgrep '\p{InThai}' somefile.txt + + use perl regexp unicode values to match thai numbers + $ tgrep '^[\x{0e50}-\x{0e59}]+$' other.file + + match lines with a thai number + $ tgrep '[:thaidigit:]' afile.txt + +NOTES + grep(1) also can be used to match thai characters with unicode + escapes, for example + egrep "["$'\u0e01'-$'\u0e5b'"]" file.txt + would match thai unicode chars in a bash-type shell. + +SEE ALSO + grep(1), perl(1), perlre(1), locale(1), ugrep(1) + +BUGS + Only utf-8 encodings are supported. + The character classes used by this program ([:thai*:]) are not + standard or supported by other programs. 
+ Quoting perl regular expression can sometimes be difficult from + within the shell. + +EOF +} + +# handle convenience character classes +if ( index($pattern, "[:thai:]") != -1 ) { + $pattern =~ s!\[\:thai\:\]!\\p\{InThai\}!g; +} +if ( index($pattern, "[:thaiconsonant:]") != -1 ) { + # chars between ก & ฮ inclusive + $pattern =~ s!\[\:thaiconsonant\:\]!\[\x{0e01}-\x{0e2e}\]!g; +} +if ( index($pattern, "[:thaidigit:]") != -1 ) { + $pattern =~ s!\[\:thaidigit\:\]![๐๑๒๓๔๕๖๗๘๙]!g; +} +if ( index($pattern, "[:thaitonemark:]") != -1 ) { + $pattern =~ s!\[\:thaitonemark\:\]![่้๊๋]!g; +} +if ( index($pattern, "[:thaivowel:]") != -1 ) { + $pattern =~ s!\[\:thaivowel\:\]![ะัา ำิีึืุูเแโใไๅ็]!g; +} +if ( index($pattern, "[:thaivowelfull:]") != -1 ) { + $pattern =~ s!\[\:thaivowelfull\:\]![ะัา ำิีึืุูเแโใไๅ็อรวยฤฦๅ]!g; +} +if ( index($pattern, "[:thaimisc:]") != -1 ) { + $pattern =~ s!\[\:thaimisc\:\]![ฯๆ฿์ํ๎๏ฺ๚๛]!g; +} + +my $qpattern = ( $opt_ignorecase ? qr/$pattern/iou : qr/$pattern/ou ); +#print "pattern \"$pattern\"\n"; +#print "qpattern \"$qpattern\"\n"; + +# if no file args or just "-", assume stdin +push @files, "/dev/stdin" if ! @files; +@files = map { $_ eq "-" ? "/dev/stdin" : $_ } @files; + +# maybe help to improve matching speed +my $not_invert = ! 
$opt_invert; + +my $match_found = 0; +my $error_occurred = 0; +foreach my $file ( @files ) { + my $info; + unless ( open $info, $file ) { + warn "Could not open $file: $!"; + $error_occurred = 1; + next; + } + + my $line_num = 1; + my $print_filename = ( scalar ( @files ) > 1 ) && $file ne "/dev/stdin"; + my $print_linenum = $opt_linenum && $file ne "/dev/stdin"; + + while( my $line = <$info> ) { + if ( ( $not_invert && $line =~ m/$qpattern/ ) || + ( $opt_invert && $line !~ m/$qpattern/ ) ) { + $match_found = 1; + if ( $opt_quiet ) { + last; + } elsif ( $opt_filesonly ) { + print $file, "\n"; + last; + } + print $file,":" if $print_filename; + print $line_num,":" if $print_linenum; + chomp($line); + print $line, "\n"; + } + $line_num += 1; + } + unless ( close $info ) { + warn "Could not close $file: $!"; + $error_occurred = 1; + } +} + +# exit with same error codes as grep +if ( $error_occurred ) { + if ( $match_found && $opt_quiet ) { + exit 0; + } else { + exit 2; + } +} else { + exit ( $match_found ? 0 : 1 ); +} --_----------=_159863056340790--