Wed Nov 24 15:56:18 2021 UTC ()
nltk_data: add shared files for nltk_data packages

This also includes a tool to create these packages.


(wiz)
diff -r0 -r1.1 pkgsrc/meta-pkgs/nltk_data/common.mk
diff -r0 -r1.1 pkgsrc/meta-pkgs/nltk_data/howto.md
diff -r0 -r1.1 pkgsrc/meta-pkgs/nltk_data/split.py

File Added: pkgsrc/meta-pkgs/nltk_data/common.mk
# $NetBSD: common.mk,v 1.1 2021/11/24 15:56:18 wiz Exp $

# Settings shared by the nltk_data-* packages.  TYPE (the upstream
# sub-directory of the data set) and DISTNAME are supplied by the package
# Makefile that includes this file.
MASTER_SITES=	https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${TYPE}/
EXTRACT_SUFX?=	.zip

MAINTAINER?=	pkgsrc-users@NetBSD.org
HOMEPAGE?=	https://www.nltk.org/data.html
COMMENT?=	Natural Language Toolkit (NLTK) Data

# UNPACK=no (the default) installs the distfile unchanged; any other value
# installs the extracted ${DISTNAME} tree instead, which is copied with pax.
UNPACK?=	no
.if ${UNPACK} != "no"
USE_TOOLS+=	pax
.endif

INSTALLATION_DIRS+=	share/nltk_data/${TYPE}

# Nothing to build: the packages only ship data files.
do-build:

do-install:
.if ${UNPACK} == "no"
	${INSTALL_DATA} ${_DISTDIR}/${DISTNAME}${EXTRACT_SUFX} ${DESTDIR}${PREFIX}/share/nltk_data/${TYPE}
.else
	cd ${WRKDIR} && ${PAX} -pp -rw ${DISTNAME} ${DESTDIR}${PREFIX}/share/nltk_data/${TYPE}/
.endif

File Added: pkgsrc/meta-pkgs/nltk_data/howto.md
# Sources

Fetch https://www.nltk.org/nltk_data/, which is an XML file with an XSL
stylesheet. For example,

    wget -O nltk_data.xml  https://www.nltk.org/nltk_data/

should work.
The file contains one line per data package (108 entries as of 2021-11-24)
plus some meta-package information.

# Generating the packages

Update the date in `split.py` and run it:

	split.py
	
It will generate one package for each entry in the list, in
`textproc/nltk_data-${id}`.
You'll then need to run `make mdi` in each directory. If the package existed
before, make sure that the data really changed (the distinfo checksums or
sizes differ) before committing.

File Added: pkgsrc/meta-pkgs/nltk_data/split.py
#!/usr/bin/env python3
"""Generate pkgsrc package skeletons for the NLTK data collection.

Reads the NLTK data index (``nltk_data.xml`` in the current directory by
default) and writes a Makefile, DESCR and PLIST for every entry into
``textproc/nltk_data-<id>`` below the pkgsrc tree.
"""

import os
import xml.etree.ElementTree as ET

# Version stamp of the generated packages; update it before each run.
DATE = "20211124"

# Unexpanded RCS keyword written into the generated files.  It is assembled
# from two pieces so that keyword expansion never rewrites it inside this
# script (which would stamp every generated file with this script's id).
RCS_TAG = "$" + "NetBSD$"


def _makefile_text(pkg_id, name, subdir, homepage, pkg_license, date):
    """Return the contents of the package Makefile for one index entry."""
    # Emit HOMEPAGE only when the index provides one, so that a missing
    # value does not leave a stray blank line in the Makefile.
    homepage_line = f"HOMEPAGE=\t{homepage}\n" if homepage else ""
    return (
        f"# {RCS_TAG}\n"
        "\n"
        f"DISTNAME=\t{pkg_id}\n"
        f"PKGNAME=\tnltk_data-{pkg_id}-{date}\n"
        "CATEGORIES=\ttextproc\n"
        "DIST_SUBDIR=\t${PKGNAME_NOREV}\n"
        "\n"
        f"{homepage_line}"
        f"COMMENT=\tNLTK Data - {name}\n"
        # The license is left commented out so a human has to confirm it.
        f"#LICENSE=\t{pkg_license}\n"
        "\n"
        f"TYPE=\t\t{subdir}\n"
        "\n"
        '.include "../../meta-pkgs/nltk_data/common.mk"\n'
        '.include "../../mk/bsd.pkg.mk"\n'
    )


def _write_text(path, text):
    """Create or overwrite ``path`` with ``text`` (UTF-8)."""
    with open(path, "w", encoding="utf-8") as out:
        out.write(text)


def main(index="nltk_data.xml", pkgsrc_dir="/usr/pkgsrc", date=DATE):
    """Write one package skeleton per entry of the NLTK data index.

    index      -- path of the XML index file to read
    pkgsrc_dir -- root of the pkgsrc tree; packages go into its textproc/
    date       -- version stamp appended to every PKGNAME

    Raises KeyError if an entry lacks the "id", "name" or "subdir"
    attribute, and OSError if a directory or file cannot be written.
    """
    root = ET.parse(index).getroot()

    # root[0] is iterated as the list of package entries (assumed to be the
    # first child of the index's root element).
    for entry in root[0]:
        attrs = entry.attrib
        pkg_id = attrs["id"]
        name = attrs["name"]
        subdir = attrs["subdir"]
        homepage = attrs.get("webpage", "")
        # Use a placeholder when the index has no license, instead of
        # failing or silently reusing the previous entry's value.
        pkg_license = attrs.get("license", "# TODO: (see mk/license.mk)")

        pkg_dir = os.path.join(pkgsrc_dir, "textproc", f"nltk_data-{pkg_id}")
        # An already existing package directory is fine; any other failure
        # (permissions, read-only tree, ...) is reported right here.
        os.makedirs(pkg_dir, exist_ok=True)

        _write_text(
            os.path.join(pkg_dir, "Makefile"),
            _makefile_text(pkg_id, name, subdir, homepage, pkg_license, date),
        )
        _write_text(
            os.path.join(pkg_dir, "DESCR"),
            "This package contains data for NLTK, the Natural Language Toolkit.\n"
            "\n"
            f"This package contains data from/for {name}.\n",
        )
        # The path has to match the directory the shared common.mk installs
        # into, i.e. share/nltk_data/<TYPE>.
        _write_text(
            os.path.join(pkg_dir, "PLIST"),
            f"@comment {RCS_TAG}\n"
            f"share/nltk_data/{subdir}/{pkg_id}.zip\n",
        )


if __name__ == "__main__":
    main()