Mon Dec 2 02:00:42 2019 UTC ()
textproc/word2vec: Import version 0.1c

word2vec is an implementation of the Continuous Bag-of-Words (CBOW)
and Skip-gram (SG) models, along with several demo scripts.  Given a
text corpus, the word2vec tool learns a vector for every word in the
vocabulary using either the CBOW or the Skip-gram neural network
architecture.
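
For reference, a minimal training-and-query session with the installed
tools might look like the following sketch (corpus and output file
names are illustrative; the options shown are those used by the
upstream demo scripts):

  $ word2vec -train corpus.txt -output vectors.bin -cbow 1 \
        -size 200 -window 8 -binary 1
  $ distance vectors.bin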


(minskim)
diff -r0 -r1.1 pkgsrc/textproc/word2vec/DESCR
diff -r0 -r1.1 pkgsrc/textproc/word2vec/Makefile
diff -r0 -r1.1 pkgsrc/textproc/word2vec/PLIST
diff -r0 -r1.1 pkgsrc/textproc/word2vec/distinfo
diff -r0 -r1.1 pkgsrc/textproc/word2vec/patches/patch-makefile
diff -r0 -r1.1 pkgsrc/textproc/word2vec/patches/patch-word2phrase.c
diff -r0 -r1.1 pkgsrc/textproc/word2vec/patches/patch-word2vec.c

File Added: pkgsrc/textproc/word2vec/DESCR
word2vec is an implementation of the Continuous Bag-of-Words (CBOW)
and Skip-gram (SG) models, along with several demo scripts.  Given a
text corpus, the word2vec tool learns a vector for every word in the
vocabulary using either the CBOW or the Skip-gram neural network
architecture.

File Added: pkgsrc/textproc/word2vec/Makefile
# $NetBSD: Makefile,v 1.1 2019/12/02 02:00:41 minskim Exp $

DISTNAME=	word2vec-0.1c
CATEGORIES=	textproc
MASTER_SITES=	${MASTER_SITE_GITHUB:=tmikolov/}
GITHUB_TAG=	20c129af10659f7c50e86e3be406df663beff438

MAINTAINER=	minskim@NetBSD.org
HOMEPAGE=	https://github.com/tmikolov/word2vec
COMMENT=	Tools for computing distributed representations of words
LICENSE=	apache-2.0

NO_CONFIGURE=	yes

INSTALLATION_DIRS+=	bin

do-install:
.for cmd in compute-accuracy distance word2phrase word2vec word-analogy
	${INSTALL_PROGRAM} ${WRKSRC}/${cmd} ${DESTDIR}${PREFIX}/bin
.endfor

.include "../../mk/bsd.pkg.mk"
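
A package built from this Makefile can be installed in the usual
pkgsrc way (the /usr/pkgsrc path below is only an example of where a
pkgsrc checkout may live):

  $ cd /usr/pkgsrc/textproc/word2vec
  $ make install clean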

File Added: pkgsrc/textproc/word2vec/PLIST
@comment $NetBSD: PLIST,v 1.1 2019/12/02 02:00:41 minskim Exp $
bin/compute-accuracy
bin/distance
bin/word-analogy
bin/word2phrase
bin/word2vec

File Added: pkgsrc/textproc/word2vec/distinfo
$NetBSD: distinfo,v 1.1 2019/12/02 02:00:41 minskim Exp $

SHA1 (word2vec-0.1c-20c129af10659f7c50e86e3be406df663beff438.tar.gz) = 4f0e872348d60223ba3b8412c0b9ccd7dbd07551
RMD160 (word2vec-0.1c-20c129af10659f7c50e86e3be406df663beff438.tar.gz) = de98886c52303242566eacd5a3eaf4459026bd71
SHA512 (word2vec-0.1c-20c129af10659f7c50e86e3be406df663beff438.tar.gz) = 698fa7e2e3ce3be4e4ecbe59bfe7f83640f4bc004b089b2b2cd9daa8233e98fbc5b541433317c647a0c796dd9aa2cd3aa186a1f8287e9f536104ed5fc6c1f65c
Size (word2vec-0.1c-20c129af10659f7c50e86e3be406df663beff438.tar.gz) = 104875 bytes
SHA1 (patch-makefile) = 2e32c5af8922008c2961fb2a7a4f59fd31ae0df9
SHA1 (patch-word2phrase.c) = 47ccf0897b76960a6ef48ddfffc60cc4c59afaee
SHA1 (patch-word2vec.c) = 1f0e2cf42c6156268f60075aa0a60ab750bc8bfd

File Added: pkgsrc/textproc/word2vec/patches/patch-makefile
$NetBSD: patch-makefile,v 1.1 2019/12/02 02:00:41 minskim Exp $

Do not override compiler set by pkgsrc.

--- makefile.orig	2017-07-16 22:46:08.000000000 +0000
+++ makefile
@@ -1,6 +1,6 @@
-CC = gcc
+CC?= gcc
 #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
-CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
+CFLAGS+= -lm -pthread -Wall -funroll-loops -Wno-unused-result
 
 all: word2vec word2phrase distance word-analogy compute-accuracy
 

File Added: pkgsrc/textproc/word2vec/patches/patch-word2phrase.c
$NetBSD: patch-word2phrase.c,v 1.1 2019/12/02 02:00:41 minskim Exp $

Portability fixes: fgetc_unlocked() and fputc_unlocked() are GNU
extensions; use the POSIX getc_unlocked() and putc_unlocked() instead.
https://github.com/tmikolov/word2vec/pull/40

--- word2phrase.c.orig	2017-07-16 22:46:08.000000000 +0000
+++ word2phrase.c
@@ -42,7 +42,7 @@ unsigned long long next_random = 1;
 void ReadWord(char *word, FILE *fin, char *eof) {
   int a = 0, ch;
   while (1) {
-    ch = fgetc_unlocked(fin);
+    ch = getc_unlocked(fin);
     if (ch == EOF) {
       *eof = 1;
       break;
@@ -246,7 +246,7 @@ void TrainModel() {
     if (eof) break;
     if (word[0] == '\n') {
       //fprintf(fo, "\n");
-      fputc_unlocked('\n', fo);
+      putc_unlocked('\n', fo);
       continue;
     }
     cn++;
@@ -286,12 +286,12 @@ void TrainModel() {
     next_random = next_random * (unsigned long long)25214903917 + 11;
     //if (next_random & 0x10000) score = 0;
     if (score > threshold) {
-      fputc_unlocked('_', fo);
+      putc_unlocked('_', fo);
       pb = 0;
-    } else fputc_unlocked(' ', fo);
+    } else putc_unlocked(' ', fo);
     a = 0;
     while (word[a]) {
-      fputc_unlocked(word[a], fo);
+      putc_unlocked(word[a], fo);
       a++;
     }
     pa = pb;

File Added: pkgsrc/textproc/word2vec/patches/patch-word2vec.c
$NetBSD: patch-word2vec.c,v 1.1 2019/12/02 02:00:41 minskim Exp $

Portability fix: fgetc_unlocked() is a GNU extension; use the POSIX
getc_unlocked() instead.
https://github.com/tmikolov/word2vec/pull/40

--- word2vec.c.orig	2017-07-16 22:46:08.000000000 +0000
+++ word2vec.c
@@ -71,7 +71,7 @@ void InitUnigramTable() {
 void ReadWord(char *word, FILE *fin, char *eof) {
   int a = 0, ch;
   while (1) {
-    ch = fgetc_unlocked(fin);
+    ch = getc_unlocked(fin);
     if (ch == EOF) {
       *eof = 1;
       break;