Fri Apr 18 21:13:10 2008 UTC ()
libfetch-2.5:
Add basic index parsing support for HTTP based on the ftpio.c code in
pkg_install. Permission to use the 3-clause BSD license from Thomas
Klausner in private mail.


(joerg)
diff -r1.7 -r1.8 pkgsrc/net/libfetch/Makefile
diff -r1.13 -r1.14 pkgsrc/net/libfetch/files/http.c

cvs diff -r1.7 -r1.8 pkgsrc/net/libfetch/Makefile (expand / switch to context diff)
--- pkgsrc/net/libfetch/Makefile 2008/04/16 01:01:50 1.7
+++ pkgsrc/net/libfetch/Makefile 2008/04/18 21:13:10 1.8
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.7 2008/04/16 01:01:50 joerg Exp $
+# $NetBSD: Makefile,v 1.8 2008/04/18 21:13:10 joerg Exp $
 #
 
-DISTNAME=	libfetch-2.4
+DISTNAME=	libfetch-2.5
 CATEGORIES=	net
 MASTER_SITES=	# empty
 DISTFILES=	# empty

cvs diff -r1.13 -r1.14 pkgsrc/net/libfetch/files/http.c (expand / switch to context diff)
--- pkgsrc/net/libfetch/files/http.c 2008/04/16 15:10:18 1.13
+++ pkgsrc/net/libfetch/files/http.c 2008/04/18 21:13:10 1.14
@@ -1,6 +1,8 @@
-/*	$NetBSD: http.c,v 1.13 2008/04/16 15:10:18 joerg Exp $	*/
+/*	$NetBSD: http.c,v 1.14 2008/04/18 21:13:10 joerg Exp $	*/
 /*-
  * Copyright (c) 2000-2004 Dag-Erling Coïdan Smørgrav
+ * Copyright (c) 2003 Thomas Klausner <wiz@NetBSD.org>
+ * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -1150,14 +1152,202 @@
 	return (0);
 }
 
+enum http_states {
+	ST_NONE,
+	ST_LT,
+	ST_LTA,
+	ST_TAGA,
+	ST_H,
+	ST_R,
+	ST_E,
+	ST_F,
+	ST_HREF,
+	ST_HREFQ,
+	ST_TAG,
+	ST_TAGAX,
+	ST_TAGAQ
+};
+
+struct index_parser {
+	enum http_states state;
+	struct url_ent *ue;
+	int list_size, list_len;
+};
+
+static size_t
+parse_index(struct index_parser *parser, const char *buf, size_t len)
+{
+	char *end_attr, p = *buf;
+
+	switch (parser->state) {
+	case ST_NONE:
+		/* Plain text, not in markup */
+		if (p == '<')
+			parser->state = ST_LT;
+		return 1;
+	case ST_LT:
+		/* In tag -- "<" already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == 'a' || p == 'A')
+			parser->state = ST_LTA;
+		else if (!isspace((unsigned char)p))
+			parser->state = ST_TAG;
+		return 1;
+	case ST_LTA:
+		/* In tag -- "<a" already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGAQ;
+		else if (isspace((unsigned char)p))
+			parser->state = ST_TAGA;
+		else
+			parser->state = ST_TAG;
+		return 1;
+	case ST_TAG:
+		/* In tag, but not "<a" -- disregard */
+		if (p == '>')
+			parser->state = ST_NONE;
+		return 1;
+	case ST_TAGA:
+		/* In a-tag -- "<a " already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGAQ;
+		else if (p == 'h' || p == 'H')
+			parser->state = ST_H;
+		else if (!isspace((unsigned char)p))
+			parser->state = ST_TAGAX;
+		return 1;
+	case ST_TAGAX:
+		/* In unknown keyword in a-tag */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGAQ;
+		else if (isspace((unsigned char)p))
+			parser->state = ST_TAGA;
+		return 1;
+	case ST_TAGAQ:
+		/* In a-tag, unknown argument for keys. */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGA;
+		return 1;
+	case ST_H:
+		/* In a-tag -- "<a h" already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGAQ;
+		else if (p == 'r' || p == 'R')
+			parser->state = ST_R;
+		else if (isspace((unsigned char)p))
+			parser->state = ST_TAGA;
+		else
+			parser->state = ST_TAGAX;
+		return 1;
+	case ST_R:
+		/* In a-tag -- "<a hr" already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGAQ;
+		else if (p == 'e' || p == 'E')
+			parser->state = ST_E;
+		else if (isspace((unsigned char)p))
+			parser->state = ST_TAGA;
+		else
+			parser->state = ST_TAGAX;
+		return 1;
+	case ST_E:
+		/* In a-tag -- "<a hre" already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGAQ;
+		else if (p == 'f' || p == 'F')
+			parser->state = ST_F;
+		else if (isspace((unsigned char)p))
+			parser->state = ST_TAGA;
+		else
+			parser->state = ST_TAGAX;
+		return 1;
+	case ST_F:
+		/* In a-tag -- "<a href" already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_TAGAQ;
+		else if (p == '=')
+			parser->state = ST_HREF;
+		else if (!isspace((unsigned char)p))
+			parser->state = ST_TAGAX;
+		return 1;
+	case ST_HREF:
+		/* In a-tag -- "<a href=" already found */
+		if (p == '>')
+			parser->state = ST_NONE;
+		else if (p == '"')
+			parser->state = ST_HREFQ;
+		else if (!isspace((unsigned char)p))
+			parser->state = ST_TAGA;
+		return 1;
+	case ST_HREFQ:
+		/* In href of the a-tag */
+		end_attr = memchr(buf, '"', len);
+		if (end_attr == NULL)
+			return 0;
+		*end_attr = '\0';
+		parser->state = ST_TAGA;
+		fetch_add_entry(&parser->ue, &parser->list_size, &parser->list_len, buf, NULL);
+		return end_attr + 1 - buf;
+	}
+	abort();
+}
+
 /*
  * List a directory
  */
 struct url_ent *
 fetchFilteredListHTTP(struct url *url, const char *pattern, const char *flags)
 {
-	fprintf(stderr, "fetchFilteredListHTTP(): not implemented\n");
-	return (NULL);
+	fetchIO *f;
+	char buf[2 * PATH_MAX];
+	size_t buf_len, processed, sum_processed;
+	ssize_t read_len;
+	struct index_parser state;
+
+	state.state = ST_NONE;
+	state.ue = NULL;
+	state.list_size = state.list_len = 0;
+
+	f = fetchGetHTTP(url, flags);
+	if (f == NULL)
+		return NULL;
+
+	buf_len = 0;
+
+	while ((read_len = fetchIO_read(f, buf + buf_len, sizeof(buf) - buf_len)) > 0) {
+		buf_len += read_len;
+		sum_processed = 0;
+		do {
+			processed = parse_index(&state, buf + sum_processed, buf_len);
+			buf_len -= processed;
+			sum_processed += processed;
+		} while (processed != 0 && buf_len > 0);
+		memmove(buf, buf + sum_processed, buf_len);
+	}
+
+	fetchIO_close(f);
+	if (read_len < 0) {
+		free(state.ue);
+		state.ue = NULL;
+	}
+	return state.ue;
 }
 
 /*
@@ -1166,6 +1356,5 @@
 struct url_ent *
 fetchListHTTP(struct url *url, const char *flags)
 {
-	fprintf(stderr, "fetchListHTTP(): not implemented\n");
-	return (NULL);
+	return fetchFilteredList(url, "*", flags);
 }