diff --git a/.gitignore b/.gitignore index db7c4cf..6d8d72b 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,4 @@ gumbocy.html venv/ *.rst gumbo-parser +/tests/_benchmark_fixture.html diff --git a/.travis.yml b/.travis.yml index 4fbcdbc..dc63436 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,8 @@ before_install: - docker ps - docker info - docker version -# - docker pull commonsearch/gumbocy + - ./scripts/git-set-file-times + - docker pull commonsearch/gumbocy - make docker_build script: diff --git a/Dockerfile b/Dockerfile index a6b57f4..b1b1bc5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,3 +44,11 @@ RUN curl -L 'https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.3.1-li RUN /opt/pypy/bin/pypy -m ensurepip RUN /opt/pypy/bin/pip install -r /requirements.txt RUN /opt/pypy/bin/pip install -r /requirements-benchmark.txt + +# Install RE2 +RUN mkdir -p /tmp/re2 && \ + curl -L 'https://github.com/google/re2/archive/636bc71728b7488c43f9441ecfc80bdb1905b3f0.tar.gz' -o /tmp/re2/re2.tar.gz && \ + cd /tmp/re2 && tar zxvf re2.tar.gz --strip-components=1 && \ + make && make install && \ + rm -rf /tmp/re2 && \ + ldconfig diff --git a/README.md b/README.md index 603558b..f80a1c9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # gumbocy +[![Build Status](https://travis-ci.org/commonsearch/gumbocy.svg?branch=master)](https://travis-ci.org/commonsearch/gumbocy) [![Apache License 2.0](https://img.shields.io/github/license/commonsearch/gumbocy.svg)](LICENSE) + **gumbocy** is an alternative Python binding for the excellent [Gumbo](https://github.com/google/gumbo-parser) HTML5 parser, originally written for [Common Search](http://about.commonsearch.org). It differs from the [official Python binding](https://github.com/google/gumbo-parser/tree/master/python/gumbo) in a few ways: @@ -7,7 +9,8 @@ It differs from the [official Python binding](https://github.com/google/gumbo-pa - It is optimized for performance by using [Cython](http://cython.org/). 
- It has a smaller feature set and doesn't aim to be a general-purpose binding. - Its `listnodes()` API just returns nodes as a flat list of tuples. - - It is generally restrictive: attributes have to be whitelisted. + - Its `analyze()` API traverses the HTML tree and returns high-level data like groups of words and lists of hyperlinks. + - It is generally restrictive. For instance, attributes have to be whitelisted. ## Installation @@ -58,31 +61,41 @@ make test ``` import gumbocy -parser = gumbocy.HTMLParser("""<html><head><title>Hello</title></head><body>world!</body></html>""") -parser.parse() -print parser.listnodes(options={}) +parser = gumbocy.HTMLParser(options={}) +parser.parse("""<html><head><title>Hello</title></head><body>world!</body></html>""") +print parser.listnodes() => [(0, "html"), (1, "head"), (2, "title"), (3, None, "Hello"), (1, "body"), (2, None, "world!")] + +print parser.analyze() + +=> {'word_groups': [('world!', 'body')], 'external_hyperlinks': [], 'internal_hyperlinks': [], 'title': 'Hello'} + ``` -For more usage examples, see the [tests](https://github.com/commonsearch/gumbocy/blob/master/tests/test_basic.py). +For more usage examples, see the [tests](https://github.com/commonsearch/gumbocy/blob/master/tests/). ## Options reference - - **attributes_whitelist**: a set of attributes which, if present, will be returned in a dict as the 3rd element of a node tuple. Note that "class" is returned as a frozenset. Defaults to `set()`. + - **attributes_whitelist**: a set of attributes which, if present, will be returned in a dict as the 3rd element of a node tuple by `listnodes()`. Note that "class" is returned as a frozenset. Defaults to `set()`. - **nesting_limit**: an integer to specify the maximum nesting level that will be returned. Defaults to `999`. - **head_only**: a boolean that will make gumbocy return only the elements in the `<head>` of the document. Useful for parsing only `<meta>` tags for instance. Defaults to `False`. - **tags_ignore**: a list of tag names that won't be returned (as well as their children). 
- - **ids_ignore**: a list of IDs for which matching elements (and their children) won't be returned. "id" needs to be in `attributes_whitelist` for this to work. - - **classes_ignore**: a list of classes for which matching elements (and their children) won't be returned. "class" needs to be in `attributes_whitelist` for this to work. + - **ids_ignore**: a list of IDs for which matching elements (and their children) won't be returned. + - **classes_ignore**: a list of classes for which matching elements (and their children) won't be returned. ## Contributing If you are using Sublime Text, we recommend installing [Cython support](https://github.com/NotSqrt/sublime-cython). -All contributions are welcome! Feel free to use the Issues tab or send us your Pull Requests. +All contributions are welcome! Feel free to use the [Issues tab](https://github.com/commonsearch/gumbocy/issues) or send us your Pull Requests. ## Changelog -### 0.1: Initial public release \ No newline at end of file +### 0.2 + - New `analyze()` API, moving most of the tree traversal that was happening in `cosr-back` to Cython, resulting in a ~3x speedup in indexing speed. 
+ - More tests + +### 0.1 + - Initial public release diff --git a/gumbocy.cpp b/gumbocy.cpp index 999d227..f3ac9fa 100644 --- a/gumbocy.cpp +++ b/gumbocy.cpp @@ -270,12 +270,18 @@ static CYTHON_INLINE float __PYX_NAN() { #define __PYX_HAVE__gumbocy #define __PYX_HAVE_API__gumbocy #include "gumbo.h" -#include +#include "string.h" +#include #include "ios" #include "new" #include "stdexcept" #include "typeinfo" +#include "re2/stringpiece.h" +#include "re2/re2.h" +#include #include +#include +#include #include "stdio.h" #ifdef _OPENMP #include @@ -496,37 +502,116 @@ static const char *__pyx_f[] = { }; /*--- Type declarations ---*/ +struct __pyx_obj_7gumbocy_Attributes; struct __pyx_obj_7gumbocy_HTMLParser; -/* "gumbocy.pyx":13 +/* "re2cy.pxd":3 + * from libcpp.string cimport string + * + * ctypedef Arg* ArgPtr # <<<<<<<<<<<<<< + * + * + */ +typedef RE2::Arg *__pyx_t_5re2cy_ArgPtr; + +/* "gumbocy.pyx":25 + * _RE_SPLIT_WHITESPACE = re.compile(r"\s+") + * + * ctypedef enum AttributeNames: # <<<<<<<<<<<<<< + * ATTR_ID, + * ATTR_ROLE, + */ +enum __pyx_t_7gumbocy_AttributeNames { + __pyx_e_7gumbocy_ATTR_ID, + __pyx_e_7gumbocy_ATTR_ROLE, + __pyx_e_7gumbocy_ATTR_HREF, + __pyx_e_7gumbocy_ATTR_STYLE, + __pyx_e_7gumbocy_ATTR_REL, + __pyx_e_7gumbocy_ATTR_SRC, + __pyx_e_7gumbocy_ATTR_ALT, + __pyx_e_7gumbocy_ATTR_NAME, + __pyx_e_7gumbocy_ATTR_PROPERTY, + __pyx_e_7gumbocy_ATTR_CONTENT +}; +typedef enum __pyx_t_7gumbocy_AttributeNames __pyx_t_7gumbocy_AttributeNames; + +/* "gumbocy.pyx":54 + * # map[AttributeNames, const char*] values * + * cdef class Attributes: # <<<<<<<<<<<<<< + * cdef int size_classes + * cdef dict values + */ +struct __pyx_obj_7gumbocy_Attributes { + PyObject_HEAD + int size_classes; + PyObject *values; + PyObject *classes; + int has_hidden; +}; + + +/* "gumbocy.pyx":65 + * # ctypedef sAttributes Attributes * * cdef class HTMLParser: # <<<<<<<<<<<<<< * - * cdef char* html + * # Global parser variables */ struct __pyx_obj_7gumbocy_HTMLParser { PyObject_HEAD struct 
__pyx_vtabstruct_7gumbocy_HTMLParser *__pyx_vtab; - char *html; - GumboOutput *output; - PyObject *nodes; int nesting_limit; int head_only; int has_ids_ignore; int has_classes_ignore; + int has_ids_hidden; + int has_classes_hidden; int has_attributes_whitelist; + int has_classes_boilerplate; + int has_ids_boilerplate; + int has_roles_boilerplate; + int has_metas_whitelist; std::unordered_set tags_ignore; std::unordered_set tags_ignore_head_only; - PyObject *classes_ignore; - PyObject *attributes_whitelist; - PyObject *ids_ignore; + std::unordered_set tags_boilerplate; + std::unordered_set tags_boilerplate_bypass; + std::unordered_set tags_separators; + re2::RE2 *attributes_whitelist; + re2::RE2 *metas_whitelist; + re2::RE2 *classes_ignore; + re2::RE2 *ids_ignore; + re2::RE2 *classes_hidden; + re2::RE2 *ids_hidden; + re2::RE2 *classes_boilerplate; + re2::RE2 *ids_boilerplate; + re2::RE2 *roles_boilerplate; + int analyze_internal_hyperlinks; + int analyze_external_hyperlinks; + int analyze_word_groups; + PyObject *current_stack; + PyObject *analysis; + PyObject *current_word_group; + PyObject *current_hyperlink; + int has_output; + GumboOutput *output; + PyObject *nodes; }; struct __pyx_vtabstruct_7gumbocy_HTMLParser { - int (*_traverse_node)(struct __pyx_obj_7gumbocy_HTMLParser *, int, GumboNode *); + int (*guess_node_hidden)(struct __pyx_obj_7gumbocy_HTMLParser *, GumboNode *, struct __pyx_obj_7gumbocy_Attributes *); + int (*guess_node_boilerplate)(struct __pyx_obj_7gumbocy_HTMLParser *, GumboNode *, struct __pyx_obj_7gumbocy_Attributes *); + struct __pyx_obj_7gumbocy_Attributes *(*get_attributes)(struct __pyx_obj_7gumbocy_HTMLParser *, GumboNode *); + void (*close_word_group)(struct __pyx_obj_7gumbocy_HTMLParser *); + void (*add_text)(struct __pyx_obj_7gumbocy_HTMLParser *, PyObject *); + void (*add_hyperlink_text)(struct __pyx_obj_7gumbocy_HTMLParser *, PyObject *); + void (*open_hyperlink)(struct __pyx_obj_7gumbocy_HTMLParser *, struct 
__pyx_obj_7gumbocy_Attributes *); + void (*close_hyperlink)(struct __pyx_obj_7gumbocy_HTMLParser *); + int (*_traverse_node)(struct __pyx_obj_7gumbocy_HTMLParser *, int, GumboNode *, int, int, int, int, int); + int (*_traverse_node_simple)(struct __pyx_obj_7gumbocy_HTMLParser *, int, GumboNode *); + PyObject *(*free)(struct __pyx_obj_7gumbocy_HTMLParser *); }; static struct __pyx_vtabstruct_7gumbocy_HTMLParser *__pyx_vtabptr_7gumbocy_HTMLParser; @@ -625,62 +710,12 @@ static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\ static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); -/* ListAppend.proto */ -#if CYTHON_COMPILING_IN_CPYTHON -static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) { - PyListObject* L = (PyListObject*) list; - Py_ssize_t len = Py_SIZE(list); - if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) { - Py_INCREF(x); - PyList_SET_ITEM(list, len, x); - Py_SIZE(list) = len+1; - return 0; - } - return PyList_Append(list, x); -} -#else -#define __Pyx_PyList_Append(L,x) PyList_Append(L,x) -#endif - -/* IncludeStringH.proto */ -#include - -/* BytesEquals.proto */ -static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); - -/* PyObjectCall.proto */ -#if CYTHON_COMPILING_IN_CPYTHON -static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); -#else -#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) -#endif - -/* SliceObject.proto */ -static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice( - PyObject* obj, Py_ssize_t cstart, Py_ssize_t cstop, - PyObject** py_start, PyObject** py_stop, PyObject** py_slice, - int has_cstart, int has_cstop, int wraparound); - -/* PyObjectCallMethO.proto */ -#if CYTHON_COMPILING_IN_CPYTHON -static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg); -#endif - -/* 
PyObjectCallOneArg.proto */ -static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg); - -/* PyObjectCallNoArg.proto */ -#if CYTHON_COMPILING_IN_CPYTHON -static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func); -#else -#define __Pyx_PyObject_CallNoArg(func) __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL) -#endif +/* ArgTypeTest.proto */ +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact); -/* PySequenceContains.proto */ -static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyObject* seq, int eq) { - int result = PySequence_Contains(seq, item); - return unlikely(result < 0) ? result : (result == (eq == Py_EQ)); -} +/* dict_getitem_default.proto */ +static PyObject* __Pyx_PyDict_GetItemDefault(PyObject* d, PyObject* key, PyObject* default_value); /* pyfrozenset_new.proto */ static CYTHON_INLINE PyObject* __Pyx_PyFrozenSet_New(PyObject* it) { @@ -714,8 +749,44 @@ static CYTHON_INLINE PyObject* __Pyx_PyFrozenSet_New(PyObject* it) { #endif } -/* GetModuleGlobalName.proto */ -static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); +/* StringJoin.proto */ +#if PY_MAJOR_VERSION < 3 +#define __Pyx_PyString_Join __Pyx_PyBytes_Join +#define __Pyx_PyBaseString_Join(s, v) (PyUnicode_CheckExact(s) ? 
PyUnicode_Join(s, v) : __Pyx_PyBytes_Join(s, v)) +#else +#define __Pyx_PyString_Join PyUnicode_Join +#define __Pyx_PyBaseString_Join PyUnicode_Join +#endif +#if CYTHON_COMPILING_IN_CPYTHON + #if PY_MAJOR_VERSION < 3 + #define __Pyx_PyBytes_Join _PyString_Join + #else + #define __Pyx_PyBytes_Join _PyBytes_Join + #endif +#else +static CYTHON_INLINE PyObject* __Pyx_PyBytes_Join(PyObject* sep, PyObject* values); +#endif + +/* DictGetItem.proto */ +#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY +static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key) { + PyObject *value; + value = PyDict_GetItemWithError(d, key); + if (unlikely(!value)) { + if (!PyErr_Occurred()) { + PyObject* args = PyTuple_Pack(1, key); + if (likely(args)) + PyErr_SetObject(PyExc_KeyError, args); + Py_XDECREF(args); + } + return NULL; + } + Py_INCREF(value); + return value; +} +#else + #define __Pyx_PyDict_GetItem(d, key) PyObject_GetItem(d, key) +#endif /* PyThreadStateGet.proto */ #if CYTHON_COMPILING_IN_CPYTHON @@ -746,32 +817,150 @@ static void __Pyx_WriteUnraisable(const char *name, int clineno, int lineno, const char *filename, int full_traceback, int nogil); -/* ArgTypeTest.proto */ -static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, - const char *name, int exact); +/* PySequenceContains.proto */ +static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyObject* seq, int eq) { + int result = PySequence_Contains(seq, item); + return unlikely(result < 0) ? 
result : (result == (eq == Py_EQ)); +} -/* dict_getitem_default.proto */ -static PyObject* __Pyx_PyDict_GetItemDefault(PyObject* d, PyObject* key, PyObject* default_value); +/* PyObjectCall.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif -/* DictGetItem.proto */ -#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY -static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key) { - PyObject *value; - value = PyDict_GetItemWithError(d, key); - if (unlikely(!value)) { - if (!PyErr_Occurred()) { - PyObject* args = PyTuple_Pack(1, key); - if (likely(args)) - PyErr_SetObject(PyExc_KeyError, args); - Py_XDECREF(args); - } - return NULL; +/* IncludeStringH.proto */ +#include + +/* BytesEquals.proto */ +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); + +/* GetModuleGlobalName.proto */ +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); + +/* PyObjectCallMethO.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg); +#endif + +/* PyObjectCallOneArg.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg); + +/* PyObjectCallNoArg.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func); +#else +#define __Pyx_PyObject_CallNoArg(func) __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL) +#endif + +/* ListAppend.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) { + PyListObject* L = (PyListObject*) list; + Py_ssize_t len = Py_SIZE(list); + if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) { + Py_INCREF(x); + PyList_SET_ITEM(list, len, x); + Py_SIZE(list) = len+1; + return 0; } - 
Py_INCREF(value); - return value; + return PyList_Append(list, x); } #else - #define __Pyx_PyDict_GetItem(d, key) PyObject_GetItem(d, key) +#define __Pyx_PyList_Append(L,x) PyList_Append(L,x) +#endif + +/* PyObjectCallMethod1.proto */ +static PyObject* __Pyx_PyObject_CallMethod1(PyObject* obj, PyObject* method_name, PyObject* arg); + +/* append.proto */ +static CYTHON_INLINE int __Pyx_PyObject_Append(PyObject* L, PyObject* x); + +/* GetItemInt.proto */ +#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\ + (is_list ? (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\ + __Pyx_GetItemInt_Generic(o, to_py_func(i)))) +#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\ + (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\ + (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, + int is_list, int wraparound, int boundscheck); + +/* SetItemInt.proto */ +#define __Pyx_SetItemInt(o, i, v, type, is_signed, to_py_func, is_list, 
wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_SetItemInt_Fast(o, (Py_ssize_t)i, v, is_list, wraparound, boundscheck) :\ + (is_list ? (PyErr_SetString(PyExc_IndexError, "list assignment index out of range"), -1) :\ + __Pyx_SetItemInt_Generic(o, to_py_func(i), v))) +static CYTHON_INLINE int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v); +static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v, + int is_list, int wraparound, int boundscheck); + +/* SliceObject.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice( + PyObject* obj, Py_ssize_t cstart, Py_ssize_t cstop, + PyObject** py_start, PyObject** py_stop, PyObject** py_slice, + int has_cstart, int has_cstop, int wraparound); + +/* PyObjectCallMethod2.proto */ +static PyObject* __Pyx_PyObject_CallMethod2(PyObject* obj, PyObject* method_name, PyObject* arg1, PyObject* arg2); + +/* dict_setdefault.proto */ +static CYTHON_INLINE PyObject *__Pyx_PyDict_SetDefault(PyObject *d, PyObject *key, PyObject *default_value, int is_safe_type); + +/* PyDictContains.proto */ +static CYTHON_INLINE int __Pyx_PyDict_ContainsTF(PyObject* item, PyObject* dict, int eq) { + int result = PyDict_Contains(dict, item); + return unlikely(result < 0) ? 
result : (result == (eq == Py_EQ)); +} + +/* PyObjectCallMethod0.proto */ +static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name); + +/* pop.proto */ +static CYTHON_INLINE PyObject* __Pyx__PyObject_Pop(PyObject* L); +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyList_Pop(PyObject* L); +#define __Pyx_PyObject_Pop(L) (likely(PyList_CheckExact(L)) ?\ + __Pyx_PyList_Pop(L) : __Pyx__PyObject_Pop(L)) +#else +#define __Pyx_PyList_Pop(L) __Pyx__PyObject_Pop(L) +#define __Pyx_PyObject_Pop(L) __Pyx__PyObject_Pop(L) +#endif + +/* UnpackUnboundCMethod.proto */ +typedef struct { + PyObject *type; + PyObject **method_name; + PyCFunction func; + PyObject *method; + int flag; +} __Pyx_CachedCFunction; + +/* CallUnboundCMethod0.proto */ +static PyObject* __Pyx__CallUnboundCMethod0(__Pyx_CachedCFunction* cfunc, PyObject* self); +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_CallUnboundCMethod0(cfunc, self)\ + ((likely((cfunc)->func)) ?\ + (likely((cfunc)->flag == METH_NOARGS) ? (*((cfunc)->func))(self, NULL) :\ + (likely((cfunc)->flag == (METH_VARARGS | METH_KEYWORDS)) ? ((*(PyCFunctionWithKeywords)(cfunc)->func)(self, __pyx_empty_tuple, NULL)) :\ + ((cfunc)->flag == METH_VARARGS ? 
(*((cfunc)->func))(self, __pyx_empty_tuple) : __Pyx__CallUnboundCMethod0(cfunc, self)))) :\ + __Pyx__CallUnboundCMethod0(cfunc, self)) +#else +#define __Pyx_CallUnboundCMethod0(cfunc, self) __Pyx__CallUnboundCMethod0(cfunc, self) #endif /* SetVTable.proto */ @@ -802,20 +991,63 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, /* None.proto */ #include +/* CppExceptionConversion.proto */ +#ifndef __Pyx_CppExn2PyErr +#include +#include +#include +#include +static void __Pyx_CppExn2PyErr() { + try { + if (PyErr_Occurred()) + ; // let the latest Python exn pass through and ignore the current one + else + throw; + } catch (const std::bad_alloc& exn) { + PyErr_SetString(PyExc_MemoryError, exn.what()); + } catch (const std::bad_cast& exn) { + PyErr_SetString(PyExc_TypeError, exn.what()); + } catch (const std::domain_error& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::invalid_argument& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::ios_base::failure& exn) { + PyErr_SetString(PyExc_IOError, exn.what()); + } catch (const std::out_of_range& exn) { + PyErr_SetString(PyExc_IndexError, exn.what()); + } catch (const std::overflow_error& exn) { + PyErr_SetString(PyExc_OverflowError, exn.what()); + } catch (const std::range_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::underflow_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::exception& exn) { + PyErr_SetString(PyExc_RuntimeError, exn.what()); + } + catch (...) 
+ { + PyErr_SetString(PyExc_RuntimeError, "Unknown exception"); + } +} +#endif + /* CIntToPy.proto */ -static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); +static CYTHON_INLINE PyObject* __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_t_7gumbocy_AttributeNames value); /* CIntToPy.proto */ static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_int(unsigned int value); -/* CIntFromPy.proto */ -static CYTHON_INLINE unsigned int __Pyx_PyInt_As_unsigned_int(PyObject *); +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); /* CIntFromPy.proto */ static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); -/* CIntToPy.proto */ -static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); +/* CIntFromPy.proto */ +static CYTHON_INLINE unsigned int __Pyx_PyInt_As_unsigned_int(PyObject *); /* CIntFromPy.proto */ static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); @@ -826,94 +1058,260 @@ static int __Pyx_check_binary_version(void); /* InitStrings.proto */ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); -static int __pyx_f_7gumbocy_10HTMLParser__traverse_node(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, int __pyx_v_level, GumboNode *__pyx_v_node); /* proto*/ +static int __pyx_f_7gumbocy_10HTMLParser_guess_node_hidden(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, CYTHON_UNUSED GumboNode *__pyx_v_node, struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs); /* proto*/ +static int __pyx_f_7gumbocy_10HTMLParser_guess_node_boilerplate(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, GumboNode *__pyx_v_node, struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs); /* proto*/ +static struct __pyx_obj_7gumbocy_Attributes *__pyx_f_7gumbocy_10HTMLParser_get_attributes(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, GumboNode *__pyx_v_node); /* proto*/ +static void 
__pyx_f_7gumbocy_10HTMLParser_close_word_group(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self); /* proto*/ +static void __pyx_f_7gumbocy_10HTMLParser_add_text(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_text); /* proto*/ +static void __pyx_f_7gumbocy_10HTMLParser_add_hyperlink_text(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_text); /* proto*/ +static void __pyx_f_7gumbocy_10HTMLParser_open_hyperlink(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs); /* proto*/ +static void __pyx_f_7gumbocy_10HTMLParser_close_hyperlink(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self); /* proto*/ +static int __pyx_f_7gumbocy_10HTMLParser__traverse_node(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, int __pyx_v_level, GumboNode *__pyx_v_node, int __pyx_v_is_head, int __pyx_v_is_hidden, int __pyx_v_is_boilerplate, int __pyx_v_is_boilerplate_bypassed, int __pyx_v_is_hyperlink); /* proto*/ +static int __pyx_f_7gumbocy_10HTMLParser__traverse_node_simple(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, int __pyx_v_level, GumboNode *__pyx_v_node); /* proto*/ +static PyObject *__pyx_f_7gumbocy_10HTMLParser_free(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self); /* proto*/ + +/* Module declarations from 'libc.string' */ + +/* Module declarations from 'libcpp.string' */ + +/* Module declarations from 're2cy' */ /* Module declarations from 'libcpp.utility' */ /* Module declarations from 'libcpp.unordered_set' */ +/* Module declarations from 'libcpp.vector' */ + +/* Module declarations from 'libcpp.map' */ + /* Module declarations from 'gumbocy' */ +static PyTypeObject *__pyx_ptype_7gumbocy_Attributes = 0; static PyTypeObject *__pyx_ptype_7gumbocy_HTMLParser = 0; +static std::vector<__pyx_t_5re2cy_ArgPtr> *__pyx_v_7gumbocy_argp; +static __pyx_t_5re2cy_ArgPtr *__pyx_v_7gumbocy_empty_args; +static re2::RE2 *__pyx_v_7gumbocy__RE2_SEARCH_STYLE_HIDDEN; +static re2::RE2 
*__pyx_v_7gumbocy__RE2_EXTERNAL_HREF; +static re2::RE2 *__pyx_v_7gumbocy__RE2_IGNORED_HREF; +static int __pyx_f_7gumbocy_re2_search(char const *, re2::RE2 &); /*proto*/ #define __Pyx_MODULE_NAME "gumbocy" int __pyx_module_is_main_gumbocy = 0; /* Implementation of 'gumbocy' */ static PyObject *__pyx_builtin_range; -static const char __pyx_k_[] = ""; +static const char __pyx_k_[] = "^(?:"; static const char __pyx_k_s[] = "\\s+"; +static const char __pyx_k__2[] = "|"; +static const char __pyx_k__3[] = ")$"; +static const char __pyx_k__4[] = " "; +static const char __pyx_k__5[] = ""; +static const char __pyx_k__8[] = "//"; static const char __pyx_k_id[] = "id"; static const char __pyx_k_re[] = "re"; +static const char __pyx_k_alt[] = "alt"; static const char __pyx_k_get[] = "get"; -static const char __pyx_k_html[] = "html"; +static const char __pyx_k_pop[] = "pop"; +static const char __pyx_k_rel[] = "rel"; +static const char __pyx_k_src[] = "src"; +static const char __pyx_k_href[] = "href"; +static const char __pyx_k_http[] = "http://"; +static const char __pyx_k_join[] = "join"; static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_name[] = "name"; +static const char __pyx_k_role[] = "role"; static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_true[] = "true"; static const char __pyx_k_class[] = "class"; +static const char __pyx_k_https[] = "https://"; static const char __pyx_k_lower[] = "lower"; static const char __pyx_k_range[] = "range"; static const char __pyx_k_split[] = "split"; static const char __pyx_k_strip[] = "strip"; +static const char __pyx_k_style[] = "style"; +static const char __pyx_k_title[] = "title"; +static const char __pyx_k_append[] = "append"; +static const char __pyx_k_hidden[] = "hidden"; static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_article[] = "article"; static const char __pyx_k_compile[] = "compile"; +static const char __pyx_k_content[] = "content"; static const char 
__pyx_k_options[] = "options"; +static const char __pyx_k_base_url[] = "base_url"; +static const char __pyx_k_property[] = "property"; static const char __pyx_k_head_only[] = "head_only"; +static const char __pyx_k_head_links[] = "head_links"; +static const char __pyx_k_head_metas[] = "head_metas"; +static const char __pyx_k_ids_hidden[] = "ids_hidden"; static const char __pyx_k_ids_ignore[] = "ids_ignore"; static const char __pyx_k_pyx_vtable[] = "__pyx_vtable__"; +static const char __pyx_k_setdefault[] = "setdefault"; +static const char __pyx_k_startswith[] = "startswith"; +static const char __pyx_k_aria_hidden[] = "aria-hidden"; static const char __pyx_k_tags_ignore[] = "tags_ignore"; +static const char __pyx_k_word_groups[] = "word_groups"; static const char __pyx_k_nesting_limit[] = "nesting_limit"; +static const char __pyx_k_classes_hidden[] = "classes_hidden"; static const char __pyx_k_classes_ignore[] = "classes_ignore"; +static const char __pyx_k_ids_boilerplate[] = "ids_boilerplate"; +static const char __pyx_k_metas_whitelist[] = "metas_whitelist"; +static const char __pyx_k_tags_separators[] = "tags_separators"; +static const char __pyx_k_tags_boilerplate[] = "tags_boilerplate"; +static const char __pyx_k_roles_boilerplate[] = "roles_boilerplate"; static const char __pyx_k_RE_SPLIT_WHITESPACE[] = "_RE_SPLIT_WHITESPACE"; +static const char __pyx_k_analyze_word_groups[] = "analyze_word_groups"; +static const char __pyx_k_classes_boilerplate[] = "classes_boilerplate"; +static const char __pyx_k_external_hyperlinks[] = "external_hyperlinks"; +static const char __pyx_k_internal_hyperlinks[] = "internal_hyperlinks"; static const char __pyx_k_attributes_whitelist[] = "attributes_whitelist"; -static PyObject *__pyx_kp_b_; +static const char __pyx_k_tags_boilerplate_bypass[] = "tags_boilerplate_bypass"; +static const char __pyx_k_analyze_external_hyperlinks[] = "analyze_external_hyperlinks"; +static const char __pyx_k_analyze_internal_hyperlinks[] = 
"analyze_internal_hyperlinks"; +static PyObject *__pyx_kp_s_; static PyObject *__pyx_n_s_RE_SPLIT_WHITESPACE; +static PyObject *__pyx_kp_s__2; +static PyObject *__pyx_kp_s__3; +static PyObject *__pyx_kp_s__4; +static PyObject *__pyx_kp_b__5; +static PyObject *__pyx_kp_s__5; +static PyObject *__pyx_kp_s__8; +static PyObject *__pyx_n_b_alt; +static PyObject *__pyx_n_s_analyze_external_hyperlinks; +static PyObject *__pyx_n_s_analyze_internal_hyperlinks; +static PyObject *__pyx_n_s_analyze_word_groups; +static PyObject *__pyx_n_s_append; +static PyObject *__pyx_kp_b_aria_hidden; +static PyObject *__pyx_n_s_article; static PyObject *__pyx_n_s_attributes_whitelist; +static PyObject *__pyx_n_s_base_url; static PyObject *__pyx_n_b_class; +static PyObject *__pyx_n_s_class; +static PyObject *__pyx_n_s_classes_boilerplate; +static PyObject *__pyx_n_s_classes_hidden; static PyObject *__pyx_n_s_classes_ignore; static PyObject *__pyx_n_s_compile; +static PyObject *__pyx_n_b_content; +static PyObject *__pyx_n_s_content; +static PyObject *__pyx_n_s_external_hyperlinks; static PyObject *__pyx_n_s_get; +static PyObject *__pyx_n_s_head_links; +static PyObject *__pyx_n_s_head_metas; static PyObject *__pyx_n_s_head_only; -static PyObject *__pyx_n_s_html; +static PyObject *__pyx_n_b_hidden; +static PyObject *__pyx_n_b_href; +static PyObject *__pyx_n_s_href; +static PyObject *__pyx_kp_s_http; +static PyObject *__pyx_kp_s_https; +static PyObject *__pyx_n_b_id; static PyObject *__pyx_n_s_id; +static PyObject *__pyx_n_s_ids_boilerplate; +static PyObject *__pyx_n_s_ids_hidden; static PyObject *__pyx_n_s_ids_ignore; static PyObject *__pyx_n_s_import; +static PyObject *__pyx_n_s_internal_hyperlinks; +static PyObject *__pyx_n_s_join; static PyObject *__pyx_n_s_lower; static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_metas_whitelist; +static PyObject *__pyx_n_b_name; +static PyObject *__pyx_n_s_name; static PyObject *__pyx_n_s_nesting_limit; static PyObject *__pyx_n_s_options; +static 
PyObject *__pyx_n_s_pop; +static PyObject *__pyx_n_b_property; +static PyObject *__pyx_n_s_property; static PyObject *__pyx_n_s_pyx_vtable; static PyObject *__pyx_n_s_range; static PyObject *__pyx_n_s_re; +static PyObject *__pyx_n_b_rel; +static PyObject *__pyx_n_s_rel; +static PyObject *__pyx_n_b_role; +static PyObject *__pyx_n_s_role; +static PyObject *__pyx_n_s_roles_boilerplate; static PyObject *__pyx_kp_s_s; +static PyObject *__pyx_n_s_setdefault; static PyObject *__pyx_n_s_split; +static PyObject *__pyx_n_b_src; +static PyObject *__pyx_n_s_startswith; static PyObject *__pyx_n_s_strip; +static PyObject *__pyx_n_b_style; +static PyObject *__pyx_n_s_tags_boilerplate; +static PyObject *__pyx_n_s_tags_boilerplate_bypass; static PyObject *__pyx_n_s_tags_ignore; +static PyObject *__pyx_n_s_tags_separators; static PyObject *__pyx_n_s_test; -static int __pyx_pf_7gumbocy_10HTMLParser___cinit__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, char *__pyx_v_html); /* proto */ -static PyObject *__pyx_pf_7gumbocy_10HTMLParser_2parse(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self); /* proto */ -static PyObject *__pyx_pf_7gumbocy_10HTMLParser_4listnodes(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_options); /* proto */ -static void __pyx_pf_7gumbocy_10HTMLParser_6__dealloc__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self); /* proto */ +static PyObject *__pyx_n_s_title; +static PyObject *__pyx_n_b_true; +static PyObject *__pyx_n_s_word_groups; +static int __pyx_pf_7gumbocy_10HTMLParser___cinit__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_options); /* proto */ +static PyObject *__pyx_pf_7gumbocy_10HTMLParser_2parse(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, char *__pyx_v_html); /* proto */ +static PyObject *__pyx_pf_7gumbocy_10HTMLParser_4analyze(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_7gumbocy_10HTMLParser_6listnodes(struct __pyx_obj_7gumbocy_HTMLParser 
*__pyx_v_self); /* proto */ +static void __pyx_pf_7gumbocy_10HTMLParser_8__dealloc__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self); /* proto */ +static PyObject *__pyx_tp_new_7gumbocy_Attributes(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ static PyObject *__pyx_tp_new_7gumbocy_HTMLParser(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static __Pyx_CachedCFunction __pyx_umethod_PyList_Type_pop = {0, &__pyx_n_s_pop, 0, 0, 0}; static PyObject *__pyx_int_999; -static PyObject *__pyx_tuple__2; -static PyObject *__pyx_tuple__3; +static PyObject *__pyx_tuple__6; +static PyObject *__pyx_tuple__7; +static PyObject *__pyx_tuple__9; +static PyObject *__pyx_tuple__10; +static PyObject *__pyx_tuple__11; + +/* "gumbocy.pyx":16 + * cdef re2cy.ArgPtr *empty_args = &(deref(argp)[0]) + * + * cdef bint re2_search(const char* s, re2cy.RE2 &pattern): # <<<<<<<<<<<<<< + * return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0) + * + */ + +static int __pyx_f_7gumbocy_re2_search(char const *__pyx_v_s, re2::RE2 &__pyx_v_pattern) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("re2_search", 0); + + /* "gumbocy.pyx":17 + * + * cdef bint re2_search(const char* s, re2cy.RE2 &pattern): + * return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0) # <<<<<<<<<<<<<< + * + * cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)") + */ + __pyx_r = re2::RE2::PartialMatchN(__pyx_v_s, __pyx_v_pattern, __pyx_v_7gumbocy_empty_args, 0); + goto __pyx_L0; -/* "gumbocy.pyx":32 + /* "gumbocy.pyx":16 + * cdef re2cy.ArgPtr *empty_args = &(deref(argp)[0]) + * + * cdef bint re2_search(const char* s, re2cy.RE2 &pattern): # <<<<<<<<<<<<<< + * return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0) * + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "gumbocy.pyx":112 + * cdef list nodes * - * def __cinit__(self, char* html): # <<<<<<<<<<<<<< - * 
self.html = html + * def __cinit__(self, dict options=None): # <<<<<<<<<<<<<< * + * options = options or {} */ /* Python wrapper */ static int __pyx_pw_7gumbocy_10HTMLParser_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static int __pyx_pw_7gumbocy_10HTMLParser_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - char *__pyx_v_html; + PyObject *__pyx_v_options = 0; int __pyx_r; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0); { - static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_html,0}; + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_options,0}; PyObject* values[1] = {0}; + values[0] = ((PyObject*)Py_None); if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); @@ -925,1739 +1323,7512 @@ static int __pyx_pw_7gumbocy_10HTMLParser_1__cinit__(PyObject *__pyx_v_self, PyO kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: - if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_html)) != 0)) kw_args--; - else goto __pyx_L5_argtuple_error; - } - if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 32, __pyx_L3_error) - } - } else if (PyTuple_GET_SIZE(__pyx_args) != 1) { - goto __pyx_L5_argtuple_error; + if (kw_args > 0) { + PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_options); + if (value) { values[0] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 112, __pyx_L3_error) + } } else { - values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } } - __pyx_v_html = __Pyx_PyObject_AsString(values[0]); if 
(unlikely((!__pyx_v_html) && PyErr_Occurred())) __PYX_ERR(0, 32, __pyx_L3_error) + __pyx_v_options = ((PyObject*)values[0]); } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 1, 1, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 32, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 0, 1, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 112, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("gumbocy.HTMLParser.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - __pyx_r = __pyx_pf_7gumbocy_10HTMLParser___cinit__(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self), __pyx_v_html); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_options), (&PyDict_Type), 1, "options", 1))) __PYX_ERR(0, 112, __pyx_L1_error) + __pyx_r = __pyx_pf_7gumbocy_10HTMLParser___cinit__(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self), __pyx_v_options); /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = -1; + __pyx_L0:; __Pyx_RefNannyFinishContext(); return __pyx_r; } -static int __pyx_pf_7gumbocy_10HTMLParser___cinit__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, char *__pyx_v_html) { +static int __pyx_pf_7gumbocy_10HTMLParser___cinit__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_options) { + PyObject *__pyx_v_attributes_whitelist = NULL; + PyObject *__pyx_v_classes_ignore = NULL; + PyObject *__pyx_v_ids_ignore = NULL; + PyObject *__pyx_v_classes_hidden = NULL; + PyObject *__pyx_v_ids_hidden = NULL; + PyObject *__pyx_v_classes_boilerplate = NULL; + PyObject *__pyx_v_ids_boilerplate = NULL; + PyObject *__pyx_v_roles_boilerplate = NULL; + PyObject *__pyx_v_metas_whitelist = NULL; + PyObject *__pyx_v_tag_name = NULL; + GumboTag __pyx_v_tag; int __pyx_r; __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + 
Py_ssize_t __pyx_t_5; + char const *__pyx_t_6; + re2::RE2 *__pyx_t_7; + int __pyx_t_8; + int __pyx_t_9; + PyObject *(*__pyx_t_10)(PyObject *); + char const *__pyx_t_11; __Pyx_RefNannySetupContext("__cinit__", 0); + __Pyx_INCREF(__pyx_v_options); - /* "gumbocy.pyx":33 + /* "gumbocy.pyx":114 + * def __cinit__(self, dict options=None): * - * def __cinit__(self, char* html): - * self.html = html # <<<<<<<<<<<<<< + * options = options or {} # <<<<<<<<<<<<<< * - * cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node): + * self.nesting_limit = options.get("nesting_limit", 999) */ - __pyx_v_self->html = __pyx_v_html; + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_options); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 114, __pyx_L1_error) + if (!__pyx_t_2) { + } else { + __Pyx_INCREF(__pyx_v_options); + __pyx_t_1 = __pyx_v_options; + goto __pyx_L3_bool_binop_done; + } + __pyx_t_3 = PyDict_New(); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 114, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_L3_bool_binop_done:; + __Pyx_DECREF_SET(__pyx_v_options, ((PyObject*)__pyx_t_1)); + __pyx_t_1 = 0; - /* "gumbocy.pyx":32 - * + /* "gumbocy.pyx":116 + * options = options or {} * - * def __cinit__(self, char* html): # <<<<<<<<<<<<<< - * self.html = html + * self.nesting_limit = options.get("nesting_limit", 999) # <<<<<<<<<<<<<< + * self.head_only = options.get("head_only") * */ + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 116, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_nesting_limit, __pyx_int_999); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 116, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 116, __pyx_L1_error) + 
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->nesting_limit = __pyx_t_4; - /* function exit code */ - __pyx_r = 0; - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -/* "gumbocy.pyx":35 - * self.html = html + /* "gumbocy.pyx":117 * - * cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node): # <<<<<<<<<<<<<< - * """ Traverses the node tree. Return 1 to stop at this level """ + * self.nesting_limit = options.get("nesting_limit", 999) + * self.head_only = options.get("head_only") # <<<<<<<<<<<<<< * + * self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True)) */ + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 117, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_head_only, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 117, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 117, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->head_only = __pyx_t_2; -static int __pyx_f_7gumbocy_10HTMLParser__traverse_node(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, int __pyx_v_level, GumboNode *__pyx_v_node) { - GumboStringPiece __pyx_v_gsp; - int __pyx_v_tag_n; - char const *__pyx_v_tag_name; - PyObject *__pyx_v_py_tag_name = NULL; - int __pyx_v_has_attrs; - PyObject *__pyx_v_attrs = NULL; - unsigned int __pyx_v_i; - GumboAttribute *__pyx_v_attr; - PyObject *__pyx_v_attr_name = NULL; - PyObject *__pyx_v_multiple_value = NULL; - PyObject *__pyx_v_v = NULL; - GumboNode *__pyx_v_child; - int __pyx_r; - __Pyx_RefNannyDeclarations - int __pyx_t_1; - PyObject *__pyx_t_2 = NULL; - PyObject *__pyx_t_3 = NULL; - PyObject *__pyx_t_4 = NULL; - int __pyx_t_5; - int __pyx_t_6; - GumboStringPiece __pyx_t_7; - char const *__pyx_t_8; - unsigned int __pyx_t_9; 
- unsigned int __pyx_t_10; - PyObject *__pyx_t_11 = NULL; - PyObject *__pyx_t_12 = NULL; - PyObject *__pyx_t_13 = NULL; - Py_ssize_t __pyx_t_14; - PyObject *(*__pyx_t_15)(PyObject *); - int __pyx_t_16; - __Pyx_RefNannySetupContext("_traverse_node", 0); + /* "gumbocy.pyx":119 + * self.head_only = options.get("head_only") + * + * self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True)) # <<<<<<<<<<<<<< + * self.analyze_internal_hyperlinks = bool(options.get("analyze_internal_hyperlinks", True)) + * self.analyze_word_groups = bool(options.get("analyze_word_groups", True)) + */ + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 119, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_analyze_external_hyperlinks, Py_True); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 119, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 119, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->analyze_external_hyperlinks = (!(!__pyx_t_2)); - /* "gumbocy.pyx":40 - * cdef GumboStringPiece gsp + /* "gumbocy.pyx":120 * - * if level > self.nesting_limit: # <<<<<<<<<<<<<< - * return 0 + * self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True)) + * self.analyze_internal_hyperlinks = bool(options.get("analyze_internal_hyperlinks", True)) # <<<<<<<<<<<<<< + * self.analyze_word_groups = bool(options.get("analyze_word_groups", True)) * */ - __pyx_t_1 = ((__pyx_v_level > __pyx_v_self->nesting_limit) != 0); - if (__pyx_t_1) { + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 120, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_analyze_internal_hyperlinks, Py_True); 
if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 120, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 120, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->analyze_internal_hyperlinks = (!(!__pyx_t_2)); - /* "gumbocy.pyx":41 - * - * if level > self.nesting_limit: - * return 0 # <<<<<<<<<<<<<< + /* "gumbocy.pyx":121 + * self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True)) + * self.analyze_internal_hyperlinks = bool(options.get("analyze_internal_hyperlinks", True)) + * self.analyze_word_groups = bool(options.get("analyze_word_groups", True)) # <<<<<<<<<<<<<< * - * if node.type == gumbocy.GUMBO_NODE_TEXT: + * attributes_whitelist = set(options.get("attributes_whitelist") or []) */ - __pyx_r = 0; - goto __pyx_L0; + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 121, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_analyze_word_groups, Py_True); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->analyze_word_groups = (!(!__pyx_t_2)); - /* "gumbocy.pyx":40 - * cdef GumboStringPiece gsp + /* "gumbocy.pyx":123 + * self.analyze_word_groups = bool(options.get("analyze_word_groups", True)) * - * if level > self.nesting_limit: # <<<<<<<<<<<<<< - * return 0 + * attributes_whitelist = set(options.get("attributes_whitelist") or []) # <<<<<<<<<<<<<< * + * classes_ignore = frozenset(options.get("classes_ignore") or []) */ + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 123, __pyx_L1_error) + } + __pyx_t_3 = 
__Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_attributes_whitelist, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 123, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 123, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else { + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L5_bool_binop_done; } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 123, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_L5_bool_binop_done:; + __pyx_t_3 = PySet_New(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 123, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_attributes_whitelist = ((PyObject*)__pyx_t_3); + __pyx_t_3 = 0; - /* "gumbocy.pyx":43 - * return 0 - * - * if node.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< - * self.nodes.append((level, None, node.v.text.text)) + /* "gumbocy.pyx":125 + * attributes_whitelist = set(options.get("attributes_whitelist") or []) * + * classes_ignore = frozenset(options.get("classes_ignore") or []) # <<<<<<<<<<<<<< + * if len(classes_ignore) > 0: + * self.has_classes_ignore = True */ - switch (__pyx_v_node->type) { - case GUMBO_NODE_TEXT: + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 125, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_classes_ignore, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 125, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } else { + __Pyx_INCREF(__pyx_t_1); + 
__pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_t_1); + __pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_L7_bool_binop_done:; + __pyx_t_1 = __Pyx_PyFrozenSet_New(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_classes_ignore = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":44 + /* "gumbocy.pyx":126 * - * if node.type == gumbocy.GUMBO_NODE_TEXT: - * self.nodes.append((level, None, node.v.text.text)) # <<<<<<<<<<<<<< + * classes_ignore = frozenset(options.get("classes_ignore") or []) + * if len(classes_ignore) > 0: # <<<<<<<<<<<<<< + * self.has_classes_ignore = True + * self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$") + */ + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_classes_ignore); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 126, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":127 + * classes_ignore = frozenset(options.get("classes_ignore") or []) + * if len(classes_ignore) > 0: + * self.has_classes_ignore = True # <<<<<<<<<<<<<< + * self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$") + * attributes_whitelist.add("class") + */ + __pyx_v_self->has_classes_ignore = 1; + + /* "gumbocy.pyx":128 + * if len(classes_ignore) > 0: + * self.has_classes_ignore = True + * self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("class") * - * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: */ - if (unlikely(__pyx_v_self->nodes == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); - __PYX_ERR(0, 44, __pyx_L1_error) - } - __pyx_t_2 = 
__Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 44, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_node->v.text.text); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 44, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_classes_ignore); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = PyNumber_Add(__pyx_kp_s_, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 128, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 44, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __Pyx_GIVEREF(__pyx_t_2); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_2); - __Pyx_INCREF(Py_None); - __Pyx_GIVEREF(Py_None); - PyTuple_SET_ITEM(__pyx_t_4, 1, Py_None); - __Pyx_GIVEREF(__pyx_t_3); - PyTuple_SET_ITEM(__pyx_t_4, 2, __pyx_t_3); - __pyx_t_2 = 0; - __pyx_t_3 = 0; - __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_4); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 44, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyNumber_Add(__pyx_t_3, __pyx_kp_s__3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_1); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 128, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 128, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->classes_ignore = __pyx_t_7; - /* "gumbocy.pyx":43 - * return 0 - * - * if node.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< - * self.nodes.append((level, None, node.v.text.text)) + /* "gumbocy.pyx":129 + * self.has_classes_ignore = True + * self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$") + * attributes_whitelist.add("class") # <<<<<<<<<<<<<< * + * ids_ignore = frozenset(options.get("ids_ignore") or []) */ - break; + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_class); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 129, __pyx_L1_error) - /* "gumbocy.pyx":46 - * self.nodes.append((level, None, node.v.text.text)) - * - * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: # <<<<<<<<<<<<<< + /* "gumbocy.pyx":126 * - * tag_n = node.v.element.tag + * classes_ignore = frozenset(options.get("classes_ignore") or []) + * if len(classes_ignore) > 0: # <<<<<<<<<<<<<< + * self.has_classes_ignore = True + * self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$") */ - case GUMBO_NODE_ELEMENT: + } - /* "gumbocy.pyx":48 - * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: - * - * tag_n = node.v.element.tag # <<<<<<<<<<<<<< + /* "gumbocy.pyx":131 + * attributes_whitelist.add("class") * - * if self.head_only and self.tags_ignore_head_only.count(tag_n): + * ids_ignore = frozenset(options.get("ids_ignore") or []) # <<<<<<<<<<<<<< + * if len(ids_ignore) > 0: + * self.has_ids_ignore = True */ - __pyx_v_tag_n = ((int)__pyx_v_node->v.element.tag); + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 131, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_ids_ignore, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 131, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = 
__Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 131, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else { + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L10_bool_binop_done; + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 131, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_L10_bool_binop_done:; + __pyx_t_3 = __Pyx_PyFrozenSet_New(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 131, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_ids_ignore = ((PyObject*)__pyx_t_3); + __pyx_t_3 = 0; - /* "gumbocy.pyx":50 - * tag_n = node.v.element.tag - * - * if self.head_only and self.tags_ignore_head_only.count(tag_n): # <<<<<<<<<<<<<< - * return 1 + /* "gumbocy.pyx":132 * + * ids_ignore = frozenset(options.get("ids_ignore") or []) + * if len(ids_ignore) > 0: # <<<<<<<<<<<<<< + * self.has_ids_ignore = True + * self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$") */ - __pyx_t_6 = (__pyx_v_self->head_only != 0); - if (__pyx_t_6) { - } else { - __pyx_t_1 = __pyx_t_6; - goto __pyx_L5_bool_binop_done; - } - __pyx_t_6 = (__pyx_v_self->tags_ignore_head_only.count(__pyx_v_tag_n) != 0); - __pyx_t_1 = __pyx_t_6; - __pyx_L5_bool_binop_done:; - if (__pyx_t_1) { + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_ids_ignore); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 132, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":51 - * - * if self.head_only and self.tags_ignore_head_only.count(tag_n): - * return 1 # <<<<<<<<<<<<<< - * - * if self.tags_ignore.count(tag_n): + /* "gumbocy.pyx":133 + * ids_ignore = frozenset(options.get("ids_ignore") or []) + * if len(ids_ignore) > 0: + * self.has_ids_ignore = True # <<<<<<<<<<<<<< + * self.ids_ignore = new 
re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$") + * attributes_whitelist.add("id") */ - __pyx_r = 1; - goto __pyx_L0; + __pyx_v_self->has_ids_ignore = 1; - /* "gumbocy.pyx":50 - * tag_n = node.v.element.tag - * - * if self.head_only and self.tags_ignore_head_only.count(tag_n): # <<<<<<<<<<<<<< - * return 1 + /* "gumbocy.pyx":134 + * if len(ids_ignore) > 0: + * self.has_ids_ignore = True + * self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("id") * */ + __pyx_t_3 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_ids_ignore); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 134, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyNumber_Add(__pyx_kp_s_, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 134, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_Add(__pyx_t_1, __pyx_kp_s__3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 134, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 134, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 134, __pyx_L1_error) } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_self->ids_ignore = __pyx_t_7; - /* "gumbocy.pyx":53 - * return 1 - * - * if self.tags_ignore.count(tag_n): # <<<<<<<<<<<<<< - * return 0 + /* "gumbocy.pyx":135 + * self.has_ids_ignore = True + * self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$") + * attributes_whitelist.add("id") # <<<<<<<<<<<<<< * + * classes_hidden = frozenset(options.get("classes_hidden") or []) */ - __pyx_t_1 = (__pyx_v_self->tags_ignore.count(__pyx_v_tag_n) != 0); - if (__pyx_t_1) { + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_id); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 135, __pyx_L1_error) - /* "gumbocy.pyx":54 + /* "gumbocy.pyx":132 * - * if self.tags_ignore.count(tag_n): - * return 0 # <<<<<<<<<<<<<< + * ids_ignore = frozenset(options.get("ids_ignore") or []) + * if len(ids_ignore) > 0: # <<<<<<<<<<<<<< + * self.has_ids_ignore = True + * self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$") + */ + } + + /* "gumbocy.pyx":137 + * attributes_whitelist.add("id") * - * tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag) + * classes_hidden = frozenset(options.get("classes_hidden") or []) # <<<<<<<<<<<<<< + * if len(classes_hidden) > 0: + * self.has_classes_hidden = True */ - __pyx_r = 0; - goto __pyx_L0; + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 137, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_classes_hidden, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 137, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } else { + __Pyx_INCREF(__pyx_t_1); + __pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_1); 
__pyx_t_1 = 0; + goto __pyx_L13_bool_binop_done; + } + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_t_1); + __pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_L13_bool_binop_done:; + __pyx_t_1 = __Pyx_PyFrozenSet_New(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_classes_hidden = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":53 - * return 1 + /* "gumbocy.pyx":138 * - * if self.tags_ignore.count(tag_n): # <<<<<<<<<<<<<< - * return 0 + * classes_hidden = frozenset(options.get("classes_hidden") or []) + * if len(classes_hidden) > 0: # <<<<<<<<<<<<<< + * self.has_classes_hidden = True + * self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$") + */ + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_classes_hidden); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 138, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":139 + * classes_hidden = frozenset(options.get("classes_hidden") or []) + * if len(classes_hidden) > 0: + * self.has_classes_hidden = True # <<<<<<<<<<<<<< + * self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$") + * attributes_whitelist.add("class") + */ + __pyx_v_self->has_classes_hidden = 1; + + /* "gumbocy.pyx":140 + * if len(classes_hidden) > 0: + * self.has_classes_hidden = True + * self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("class") * */ + __pyx_t_1 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_classes_hidden); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = PyNumber_Add(__pyx_kp_s_, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; 
+ __pyx_t_1 = PyNumber_Add(__pyx_t_3, __pyx_kp_s__3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_1); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 140, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) { + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 140, __pyx_L1_error) } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->classes_hidden = __pyx_t_7; - /* "gumbocy.pyx":56 - * return 0 + /* "gumbocy.pyx":141 + * self.has_classes_hidden = True + * self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$") + * attributes_whitelist.add("class") # <<<<<<<<<<<<<< * - * tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag) # <<<<<<<<<<<<<< + * ids_hidden = frozenset(options.get("ids_hidden") or []) + */ + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_class); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 141, __pyx_L1_error) + + /* "gumbocy.pyx":138 * - * # When we find an unknown tag, find its tag_name in the buffer + * classes_hidden = frozenset(options.get("classes_hidden") or []) + * if len(classes_hidden) > 0: # <<<<<<<<<<<<<< + * self.has_classes_hidden = True + * self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$") */ - __pyx_v_tag_name = gumbo_normalized_tagname(__pyx_v_node->v.element.tag); + } - /* "gumbocy.pyx":59 + /* "gumbocy.pyx":143 + * attributes_whitelist.add("class") * - * # When we find an unknown tag, find its tag_name in the buffer - * if tag_name == b"": # <<<<<<<<<<<<<< - * gsp = node.v.element.original_tag - * gumbo_tag_from_original_text(&gsp) + * ids_hidden = frozenset(options.get("ids_hidden") or []) # <<<<<<<<<<<<<< + * if len(ids_hidden) > 0: + * self.has_ids_hidden = True */ - __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 59, __pyx_L1_error) - 
__Pyx_GOTREF(__pyx_t_4); - __pyx_t_1 = (__Pyx_PyBytes_Equals(__pyx_t_4, __pyx_kp_b_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 59, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - if (__pyx_t_1) { + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 143, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_ids_hidden, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 143, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 143, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else { + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L16_bool_binop_done; + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 143, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_L16_bool_binop_done:; + __pyx_t_3 = __Pyx_PyFrozenSet_New(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 143, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_ids_hidden = ((PyObject*)__pyx_t_3); + __pyx_t_3 = 0; - /* "gumbocy.pyx":60 - * # When we find an unknown tag, find its tag_name in the buffer - * if tag_name == b"": - * gsp = node.v.element.original_tag # <<<<<<<<<<<<<< - * gumbo_tag_from_original_text(&gsp) - * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! 
+ /* "gumbocy.pyx":144 + * + * ids_hidden = frozenset(options.get("ids_hidden") or []) + * if len(ids_hidden) > 0: # <<<<<<<<<<<<<< + * self.has_ids_hidden = True + * self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$") */ - __pyx_t_7 = __pyx_v_node->v.element.original_tag; - __pyx_v_gsp = __pyx_t_7; + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_ids_hidden); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 144, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":61 - * if tag_name == b"": - * gsp = node.v.element.original_tag - * gumbo_tag_from_original_text(&gsp) # <<<<<<<<<<<<<< - * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! - * tag_name = py_tag_name + /* "gumbocy.pyx":145 + * ids_hidden = frozenset(options.get("ids_hidden") or []) + * if len(ids_hidden) > 0: + * self.has_ids_hidden = True # <<<<<<<<<<<<<< + * self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$") + * attributes_whitelist.add("id") */ - gumbo_tag_from_original_text((&__pyx_v_gsp)); + __pyx_v_self->has_ids_hidden = 1; - /* "gumbocy.pyx":62 - * gsp = node.v.element.original_tag - * gumbo_tag_from_original_text(&gsp) - * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! 
# <<<<<<<<<<<<<< - * tag_name = py_tag_name + /* "gumbocy.pyx":146 + * if len(ids_hidden) > 0: + * self.has_ids_hidden = True + * self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("id") * */ - __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_gsp.data); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 62, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_GIVEREF(__pyx_t_3); - PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); - __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 62, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_GetSlice(__pyx_t_3, 0, __pyx_v_gsp.length, NULL, NULL, NULL, 1, 1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_lower); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 62, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_3))) { - __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); - if (likely(__pyx_t_2)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); - __Pyx_INCREF(__pyx_t_2); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_3, function); - } - } - if (__pyx_t_2) { - __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 62, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - } else { - __pyx_t_4 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 62, __pyx_L1_error) - } - __Pyx_GOTREF(__pyx_t_4); - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_v_py_tag_name = __pyx_t_4; - __pyx_t_4 = 0; + 
__pyx_t_3 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_ids_hidden); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyNumber_Add(__pyx_kp_s_, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_Add(__pyx_t_1, __pyx_kp_s__3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 146, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) { + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 146, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_self->ids_hidden = __pyx_t_7; - /* "gumbocy.pyx":63 - * gumbo_tag_from_original_text(&gsp) - * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! 
- * tag_name = py_tag_name # <<<<<<<<<<<<<< + /* "gumbocy.pyx":147 + * self.has_ids_hidden = True + * self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$") + * attributes_whitelist.add("id") # <<<<<<<<<<<<<< * - * if self.has_attributes_whitelist: + * classes_boilerplate = frozenset(options.get("classes_boilerplate") or []) */ - __pyx_t_8 = __Pyx_PyObject_AsString(__pyx_v_py_tag_name); if (unlikely((!__pyx_t_8) && PyErr_Occurred())) __PYX_ERR(0, 63, __pyx_L1_error) - __pyx_v_tag_name = ((char const *)__pyx_t_8); + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_id); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 147, __pyx_L1_error) - /* "gumbocy.pyx":59 + /* "gumbocy.pyx":144 * - * # When we find an unknown tag, find its tag_name in the buffer - * if tag_name == b"": # <<<<<<<<<<<<<< - * gsp = node.v.element.original_tag - * gumbo_tag_from_original_text(&gsp) + * ids_hidden = frozenset(options.get("ids_hidden") or []) + * if len(ids_hidden) > 0: # <<<<<<<<<<<<<< + * self.has_ids_hidden = True + * self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$") */ - } + } - /* "gumbocy.pyx":65 - * tag_name = py_tag_name + /* "gumbocy.pyx":149 + * attributes_whitelist.add("id") * - * if self.has_attributes_whitelist: # <<<<<<<<<<<<<< - * - * # Build a dict with all the whitelisted attributes + * classes_boilerplate = frozenset(options.get("classes_boilerplate") or []) # <<<<<<<<<<<<<< + * if len(classes_boilerplate) > 0: + * self.has_classes_boilerplate = True */ - __pyx_t_1 = (__pyx_v_self->has_attributes_whitelist != 0); - if (__pyx_t_1) { + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 149, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_classes_boilerplate, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = 
__Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 149, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } else { + __Pyx_INCREF(__pyx_t_1); + __pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + goto __pyx_L19_bool_binop_done; + } + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_t_1); + __pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_L19_bool_binop_done:; + __pyx_t_1 = __Pyx_PyFrozenSet_New(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_classes_boilerplate = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":68 + /* "gumbocy.pyx":150 * - * # Build a dict with all the whitelisted attributes - * has_attrs = False # <<<<<<<<<<<<<< - * attrs = False - * for i in range(node.v.element.attributes.length): + * classes_boilerplate = frozenset(options.get("classes_boilerplate") or []) + * if len(classes_boilerplate) > 0: # <<<<<<<<<<<<<< + * self.has_classes_boilerplate = True + * self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$") */ - __pyx_v_has_attrs = 0; + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_classes_boilerplate); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 150, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":69 - * # Build a dict with all the whitelisted attributes - * has_attrs = False - * attrs = False # <<<<<<<<<<<<<< - * for i in range(node.v.element.attributes.length): - * attr = node.v.element.attributes.data[i] + /* "gumbocy.pyx":151 + * classes_boilerplate = frozenset(options.get("classes_boilerplate") or []) + * if len(classes_boilerplate) > 0: + * self.has_classes_boilerplate = True # <<<<<<<<<<<<<< + * self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$") + 
* attributes_whitelist.add("class") */ - __Pyx_INCREF(Py_False); - __pyx_v_attrs = Py_False; + __pyx_v_self->has_classes_boilerplate = 1; - /* "gumbocy.pyx":70 - * has_attrs = False - * attrs = False - * for i in range(node.v.element.attributes.length): # <<<<<<<<<<<<<< - * attr = node.v.element.attributes.data[i] - * attr_name = str(attr.name) + /* "gumbocy.pyx":152 + * if len(classes_boilerplate) > 0: + * self.has_classes_boilerplate = True + * self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("class") + * */ - __pyx_t_9 = __pyx_v_node->v.element.attributes.length; - for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) { - __pyx_v_i = __pyx_t_10; + __pyx_t_1 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_classes_boilerplate); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = PyNumber_Add(__pyx_kp_s_, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyNumber_Add(__pyx_t_3, __pyx_kp_s__3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_1); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 152, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 152, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->classes_boilerplate = __pyx_t_7; - /* "gumbocy.pyx":71 - * attrs = False - * for i in range(node.v.element.attributes.length): - * attr = node.v.element.attributes.data[i] # <<<<<<<<<<<<<< - * attr_name = str(attr.name) - * if attr_name in self.attributes_whitelist: + /* "gumbocy.pyx":153 + * self.has_classes_boilerplate = True + * self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$") + * attributes_whitelist.add("class") # <<<<<<<<<<<<<< + * + * ids_boilerplate = frozenset(options.get("ids_boilerplate") or []) */ - __pyx_v_attr = ((GumboAttribute *)(__pyx_v_node->v.element.attributes.data[__pyx_v_i])); + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_class); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 153, __pyx_L1_error) - /* "gumbocy.pyx":72 - * for i in range(node.v.element.attributes.length): - * attr = node.v.element.attributes.data[i] - * attr_name = str(attr.name) # <<<<<<<<<<<<<< - * if attr_name in self.attributes_whitelist: - * if attr_name == b"class": + /* "gumbocy.pyx":150 + * + * classes_boilerplate = frozenset(options.get("classes_boilerplate") or []) + * if len(classes_boilerplate) > 0: # <<<<<<<<<<<<<< + * self.has_classes_boilerplate = True + * self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$") */ - __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 72, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 72, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_GIVEREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); - __pyx_t_4 = 0; - __pyx_t_4 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_3, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 72, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __Pyx_DECREF(__pyx_t_3); 
__pyx_t_3 = 0; - __Pyx_XDECREF_SET(__pyx_v_attr_name, __pyx_t_4); - __pyx_t_4 = 0; + } - /* "gumbocy.pyx":73 - * attr = node.v.element.attributes.data[i] - * attr_name = str(attr.name) - * if attr_name in self.attributes_whitelist: # <<<<<<<<<<<<<< - * if attr_name == b"class": - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + /* "gumbocy.pyx":155 + * attributes_whitelist.add("class") + * + * ids_boilerplate = frozenset(options.get("ids_boilerplate") or []) # <<<<<<<<<<<<<< + * if len(ids_boilerplate) > 0: + * self.has_ids_boilerplate = True */ - __pyx_t_1 = (__Pyx_PySequence_ContainsTF(__pyx_v_attr_name, __pyx_v_self->attributes_whitelist, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 73, __pyx_L1_error) - __pyx_t_6 = (__pyx_t_1 != 0); - if (__pyx_t_6) { + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 155, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_ids_boilerplate, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 155, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 155, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else { + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L22_bool_binop_done; + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 155, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_L22_bool_binop_done:; + __pyx_t_3 = __Pyx_PyFrozenSet_New(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 155, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_ids_boilerplate = ((PyObject*)__pyx_t_3); + __pyx_t_3 = 0; - /* "gumbocy.pyx":74 - * 
attr_name = str(attr.name) - * if attr_name in self.attributes_whitelist: - * if attr_name == b"class": # <<<<<<<<<<<<<< - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) - * if len(multiple_value): + /* "gumbocy.pyx":156 + * + * ids_boilerplate = frozenset(options.get("ids_boilerplate") or []) + * if len(ids_boilerplate) > 0: # <<<<<<<<<<<<<< + * self.has_ids_boilerplate = True + * self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$") */ - __pyx_t_6 = (__Pyx_PyBytes_Equals(__pyx_v_attr_name, __pyx_n_b_class, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 74, __pyx_L1_error) - if (__pyx_t_6) { + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_ids_boilerplate); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 156, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":75 - * if attr_name in self.attributes_whitelist: - * if attr_name == b"class": - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) # <<<<<<<<<<<<<< - * if len(multiple_value): - * if self.has_classes_ignore: + /* "gumbocy.pyx":157 + * ids_boilerplate = frozenset(options.get("ids_boilerplate") or []) + * if len(ids_boilerplate) > 0: + * self.has_ids_boilerplate = True # <<<<<<<<<<<<<< + * self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$") + * attributes_whitelist.add("id") */ - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_RE_SPLIT_WHITESPACE); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_split); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_12 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_12); - __pyx_t_13 = __Pyx_PyObject_GetAttrStr(__pyx_t_12, __pyx_n_s_strip); if 
(unlikely(!__pyx_t_13)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_13); - __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; - __pyx_t_12 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_13))) { - __pyx_t_12 = PyMethod_GET_SELF(__pyx_t_13); - if (likely(__pyx_t_12)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_13); - __Pyx_INCREF(__pyx_t_12); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_13, function); - } - } - if (__pyx_t_12) { - __pyx_t_11 = __Pyx_PyObject_CallOneArg(__pyx_t_13, __pyx_t_12); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; - } else { - __pyx_t_11 = __Pyx_PyObject_CallNoArg(__pyx_t_13); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 75, __pyx_L1_error) - } - __Pyx_GOTREF(__pyx_t_11); - __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; - __pyx_t_13 = __Pyx_PyObject_GetAttrStr(__pyx_t_11, __pyx_n_s_lower); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_13); - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - __pyx_t_11 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_13))) { - __pyx_t_11 = PyMethod_GET_SELF(__pyx_t_13); - if (likely(__pyx_t_11)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_13); - __Pyx_INCREF(__pyx_t_11); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_13, function); - } - } - if (__pyx_t_11) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_13, __pyx_t_11); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - } else { - __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_13); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 75, __pyx_L1_error) - } - __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; - __pyx_t_13 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && unlikely(PyMethod_Check(__pyx_t_2))) { - __pyx_t_13 = PyMethod_GET_SELF(__pyx_t_2); - if (likely(__pyx_t_13)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); - 
__Pyx_INCREF(__pyx_t_13); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_2, function); - } - } - if (!__pyx_t_13) { - __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __Pyx_GOTREF(__pyx_t_4); - } else { - __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __Pyx_GIVEREF(__pyx_t_13); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_13); __pyx_t_13 = NULL; - __Pyx_GIVEREF(__pyx_t_3); - PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_t_3); - __pyx_t_3 = 0; - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_11, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - } - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyFrozenSet_New(__pyx_t_4); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 75, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_XDECREF_SET(__pyx_v_multiple_value, ((PyObject*)__pyx_t_2)); - __pyx_t_2 = 0; + __pyx_v_self->has_ids_boilerplate = 1; - /* "gumbocy.pyx":76 - * if attr_name == b"class": - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) - * if len(multiple_value): # <<<<<<<<<<<<<< - * if self.has_classes_ignore: - * for v in multiple_value: + /* "gumbocy.pyx":158 + * if len(ids_boilerplate) > 0: + * self.has_ids_boilerplate = True + * self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("id") + * */ - __pyx_t_14 = PySet_GET_SIZE(__pyx_v_multiple_value); if (unlikely(__pyx_t_14 == -1)) __PYX_ERR(0, 76, __pyx_L1_error) - __pyx_t_6 = (__pyx_t_14 != 0); - if (__pyx_t_6) { + __pyx_t_3 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_ids_boilerplate); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 158, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 
= PyNumber_Add(__pyx_kp_s_, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 158, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_Add(__pyx_t_1, __pyx_kp_s__3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 158, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 158, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) { + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 158, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_self->ids_boilerplate = __pyx_t_7; - /* "gumbocy.pyx":77 - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) - * if len(multiple_value): - * if self.has_classes_ignore: # <<<<<<<<<<<<<< - * for v in multiple_value: - * if v in self.classes_ignore: + /* "gumbocy.pyx":159 + * self.has_ids_boilerplate = True + * self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$") + * attributes_whitelist.add("id") # <<<<<<<<<<<<<< + * + * roles_boilerplate = frozenset(options.get("roles_boilerplate") or []) */ - __pyx_t_6 = (__pyx_v_self->has_classes_ignore != 0); - if (__pyx_t_6) { + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_id); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 159, __pyx_L1_error) - /* "gumbocy.pyx":78 - * if len(multiple_value): - * if self.has_classes_ignore: - * for v in multiple_value: # <<<<<<<<<<<<<< - * if v in self.classes_ignore: - * return 0 + /* "gumbocy.pyx":156 + * + * ids_boilerplate = frozenset(options.get("ids_boilerplate") or []) + * if len(ids_boilerplate) > 0: # <<<<<<<<<<<<<< + * self.has_ids_boilerplate = True + * self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$") */ - __pyx_t_2 = PyObject_GetIter(__pyx_v_multiple_value); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 78, __pyx_L1_error) - 
__Pyx_GOTREF(__pyx_t_2); - __pyx_t_15 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 78, __pyx_L1_error) - for (;;) { - { - __pyx_t_4 = __pyx_t_15(__pyx_t_2); - if (unlikely(!__pyx_t_4)) { - PyObject* exc_type = PyErr_Occurred(); - if (exc_type) { - if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 78, __pyx_L1_error) - } - break; - } - __Pyx_GOTREF(__pyx_t_4); - } - __Pyx_XDECREF_SET(__pyx_v_v, __pyx_t_4); - __pyx_t_4 = 0; + } - /* "gumbocy.pyx":79 - * if self.has_classes_ignore: - * for v in multiple_value: - * if v in self.classes_ignore: # <<<<<<<<<<<<<< - * return 0 + /* "gumbocy.pyx":161 + * attributes_whitelist.add("id") * + * roles_boilerplate = frozenset(options.get("roles_boilerplate") or []) # <<<<<<<<<<<<<< + * if len(roles_boilerplate) > 0: + * self.has_roles_boilerplate = True */ - __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_v_v, __pyx_v_self->classes_ignore, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 79, __pyx_L1_error) - __pyx_t_1 = (__pyx_t_6 != 0); - if (__pyx_t_1) { + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 161, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_roles_boilerplate, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 161, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 161, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } else { + __Pyx_INCREF(__pyx_t_1); + __pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + goto __pyx_L25_bool_binop_done; + } + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 161, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_t_1); + __pyx_t_3 = __pyx_t_1; + 
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_L25_bool_binop_done:; + __pyx_t_1 = __Pyx_PyFrozenSet_New(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 161, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_roles_boilerplate = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":80 - * for v in multiple_value: - * if v in self.classes_ignore: - * return 0 # <<<<<<<<<<<<<< + /* "gumbocy.pyx":162 * - * if not has_attrs: + * roles_boilerplate = frozenset(options.get("roles_boilerplate") or []) + * if len(roles_boilerplate) > 0: # <<<<<<<<<<<<<< + * self.has_roles_boilerplate = True + * self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$") */ - __pyx_r = 0; - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - goto __pyx_L0; + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_roles_boilerplate); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 162, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":79 - * if self.has_classes_ignore: - * for v in multiple_value: - * if v in self.classes_ignore: # <<<<<<<<<<<<<< - * return 0 + /* "gumbocy.pyx":163 + * roles_boilerplate = frozenset(options.get("roles_boilerplate") or []) + * if len(roles_boilerplate) > 0: + * self.has_roles_boilerplate = True # <<<<<<<<<<<<<< + * self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$") + * attributes_whitelist.add("role") + */ + __pyx_v_self->has_roles_boilerplate = 1; + + /* "gumbocy.pyx":164 + * if len(roles_boilerplate) > 0: + * self.has_roles_boilerplate = True + * self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("role") * */ - } + __pyx_t_1 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_roles_boilerplate); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 164, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = PyNumber_Add(__pyx_kp_s_, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 
164, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyNumber_Add(__pyx_t_3, __pyx_kp_s__3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 164, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_1); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 164, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) { + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 164, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_self->roles_boilerplate = __pyx_t_7; - /* "gumbocy.pyx":78 - * if len(multiple_value): - * if self.has_classes_ignore: - * for v in multiple_value: # <<<<<<<<<<<<<< - * if v in self.classes_ignore: - * return 0 + /* "gumbocy.pyx":165 + * self.has_roles_boilerplate = True + * self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$") + * attributes_whitelist.add("role") # <<<<<<<<<<<<<< + * + * metas_whitelist = frozenset(options.get("metas_whitelist") or []) */ - } - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_role); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 165, __pyx_L1_error) - /* "gumbocy.pyx":77 - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) - * if len(multiple_value): - * if self.has_classes_ignore: # <<<<<<<<<<<<<< - * for v in multiple_value: - * if v in self.classes_ignore: + /* "gumbocy.pyx":162 + * + * roles_boilerplate = frozenset(options.get("roles_boilerplate") or []) + * if len(roles_boilerplate) > 0: # <<<<<<<<<<<<<< + * self.has_roles_boilerplate = True + * self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$") */ - } + } - /* "gumbocy.pyx":82 - * return 0 + /* "gumbocy.pyx":167 + * attributes_whitelist.add("role") * - * if not has_attrs: # <<<<<<<<<<<<<< - * attrs = {} - * has_attrs = True + * metas_whitelist = 
frozenset(options.get("metas_whitelist") or []) # <<<<<<<<<<<<<< + * if len(metas_whitelist) > 0: + * self.has_metas_whitelist = True */ - __pyx_t_1 = ((!(__pyx_v_has_attrs != 0)) != 0); - if (__pyx_t_1) { + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 167, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_metas_whitelist, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 167, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 167, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else { + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L28_bool_binop_done; + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 167, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_3); + __pyx_t_1 = __pyx_t_3; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_L28_bool_binop_done:; + __pyx_t_3 = __Pyx_PyFrozenSet_New(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 167, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_metas_whitelist = ((PyObject*)__pyx_t_3); + __pyx_t_3 = 0; - /* "gumbocy.pyx":83 + /* "gumbocy.pyx":168 * - * if not has_attrs: - * attrs = {} # <<<<<<<<<<<<<< - * has_attrs = True - * attrs[attr_name] = multiple_value + * metas_whitelist = frozenset(options.get("metas_whitelist") or []) + * if len(metas_whitelist) > 0: # <<<<<<<<<<<<<< + * self.has_metas_whitelist = True + * self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$") */ - __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 83, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF_SET(__pyx_v_attrs, __pyx_t_2); - __pyx_t_2 = 0; + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_metas_whitelist); 
if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 168, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_5 > 0) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":84 - * if not has_attrs: - * attrs = {} - * has_attrs = True # <<<<<<<<<<<<<< - * attrs[attr_name] = multiple_value + /* "gumbocy.pyx":169 + * metas_whitelist = frozenset(options.get("metas_whitelist") or []) + * if len(metas_whitelist) > 0: + * self.has_metas_whitelist = True # <<<<<<<<<<<<<< + * self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$") + * attributes_whitelist.add("name") + */ + __pyx_v_self->has_metas_whitelist = 1; + + /* "gumbocy.pyx":170 + * if len(metas_whitelist) > 0: + * self.has_metas_whitelist = True + * self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$") # <<<<<<<<<<<<<< + * attributes_whitelist.add("name") + * attributes_whitelist.add("property") + */ + __pyx_t_3 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_metas_whitelist); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 170, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyNumber_Add(__pyx_kp_s_, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 170, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_Add(__pyx_t_1, __pyx_kp_s__3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 170, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 170, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 170, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_self->metas_whitelist = __pyx_t_7; + + /* "gumbocy.pyx":171 + * self.has_metas_whitelist = True + * self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$") + * attributes_whitelist.add("name") # <<<<<<<<<<<<<< + * attributes_whitelist.add("property") + * attributes_whitelist.add("content") + */ + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_name); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 171, __pyx_L1_error) + + /* "gumbocy.pyx":172 + * self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$") + * attributes_whitelist.add("name") + * attributes_whitelist.add("property") # <<<<<<<<<<<<<< + * attributes_whitelist.add("content") * */ - __pyx_v_has_attrs = 1; + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_property); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 172, __pyx_L1_error) - /* "gumbocy.pyx":82 - * return 0 + /* "gumbocy.pyx":173 + * attributes_whitelist.add("name") + * attributes_whitelist.add("property") + * attributes_whitelist.add("content") # <<<<<<<<<<<<<< * - * if not has_attrs: # <<<<<<<<<<<<<< - * attrs = {} - * has_attrs = True + * # Some options add attributes to the whitelist */ - } + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_content); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 173, __pyx_L1_error) - /* "gumbocy.pyx":85 - * attrs = {} - * has_attrs = True - * attrs[attr_name] = multiple_value # <<<<<<<<<<<<<< + /* "gumbocy.pyx":168 * - * else: + * metas_whitelist = frozenset(options.get("metas_whitelist") or []) + * if len(metas_whitelist) > 0: # <<<<<<<<<<<<<< + * self.has_metas_whitelist = True + * self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$") */ - if (unlikely(PyObject_SetItem(__pyx_v_attrs, __pyx_v_attr_name, __pyx_v_multiple_value) < 0)) __PYX_ERR(0, 85, __pyx_L1_error) + } - /* 
"gumbocy.pyx":76 - * if attr_name == b"class": - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) - * if len(multiple_value): # <<<<<<<<<<<<<< - * if self.has_classes_ignore: - * for v in multiple_value: + /* "gumbocy.pyx":176 + * + * # Some options add attributes to the whitelist + * if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks: # <<<<<<<<<<<<<< + * attributes_whitelist.add("href") + * attributes_whitelist.add("rel") */ - } + __pyx_t_9 = (__pyx_v_self->analyze_external_hyperlinks != 0); + if (!__pyx_t_9) { + } else { + __pyx_t_2 = __pyx_t_9; + goto __pyx_L32_bool_binop_done; + } + __pyx_t_9 = (__pyx_v_self->analyze_internal_hyperlinks != 0); + __pyx_t_2 = __pyx_t_9; + __pyx_L32_bool_binop_done:; + if (__pyx_t_2) { - /* "gumbocy.pyx":74 - * attr_name = str(attr.name) - * if attr_name in self.attributes_whitelist: - * if attr_name == b"class": # <<<<<<<<<<<<<< - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) - * if len(multiple_value): + /* "gumbocy.pyx":177 + * # Some options add attributes to the whitelist + * if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks: + * attributes_whitelist.add("href") # <<<<<<<<<<<<<< + * attributes_whitelist.add("rel") + * */ - goto __pyx_L13; - } + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_href); if (unlikely(__pyx_t_8 == -1)) __PYX_ERR(0, 177, __pyx_L1_error) - /* "gumbocy.pyx":89 - * else: + /* "gumbocy.pyx":178 + * if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks: + * attributes_whitelist.add("href") + * attributes_whitelist.add("rel") # <<<<<<<<<<<<<< * - * if not has_attrs: # <<<<<<<<<<<<<< - * attrs = {} - * has_attrs = True + * # FInally, freeze the attributes whitelist */ - /*else*/ { - __pyx_t_1 = ((!(__pyx_v_has_attrs != 0)) != 0); - if (__pyx_t_1) { + __pyx_t_8 = PySet_Add(__pyx_v_attributes_whitelist, __pyx_n_s_rel); if (unlikely(__pyx_t_8 == -1)) 
__PYX_ERR(0, 178, __pyx_L1_error) - /* "gumbocy.pyx":90 + /* "gumbocy.pyx":176 * - * if not has_attrs: - * attrs = {} # <<<<<<<<<<<<<< - * has_attrs = True - * attrs[attr_name] = attr.value + * # Some options add attributes to the whitelist + * if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks: # <<<<<<<<<<<<<< + * attributes_whitelist.add("href") + * attributes_whitelist.add("rel") */ - __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 90, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF_SET(__pyx_v_attrs, __pyx_t_2); - __pyx_t_2 = 0; + } - /* "gumbocy.pyx":91 - * if not has_attrs: - * attrs = {} - * has_attrs = True # <<<<<<<<<<<<<< - * attrs[attr_name] = attr.value + /* "gumbocy.pyx":181 * + * # FInally, freeze the attributes whitelist + * self.has_attributes_whitelist = len(attributes_whitelist) > 0 # <<<<<<<<<<<<<< + * if self.has_attributes_whitelist: + * self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$") */ - __pyx_v_has_attrs = 1; + __pyx_t_5 = PySet_GET_SIZE(__pyx_v_attributes_whitelist); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 181, __pyx_L1_error) + __pyx_v_self->has_attributes_whitelist = (__pyx_t_5 > 0); - /* "gumbocy.pyx":89 - * else: + /* "gumbocy.pyx":182 + * # FInally, freeze the attributes whitelist + * self.has_attributes_whitelist = len(attributes_whitelist) > 0 + * if self.has_attributes_whitelist: # <<<<<<<<<<<<<< + * self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$") * - * if not has_attrs: # <<<<<<<<<<<<<< - * attrs = {} - * has_attrs = True */ - } + __pyx_t_2 = (__pyx_v_self->has_attributes_whitelist != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":92 - * attrs = {} - * has_attrs = True - * attrs[attr_name] = attr.value # <<<<<<<<<<<<<< + /* "gumbocy.pyx":183 + * self.has_attributes_whitelist = len(attributes_whitelist) > 0 + * if self.has_attributes_whitelist: + * self.attributes_whitelist = new re2cy.RE2("^(?:" + 
"|".join(attributes_whitelist) + ")$") # <<<<<<<<<<<<<< * - * if not has_attrs: + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) */ - __pyx_t_2 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 92, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - if (unlikely(PyObject_SetItem(__pyx_v_attrs, __pyx_v_attr_name, __pyx_t_2) < 0)) __PYX_ERR(0, 92, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - } - __pyx_L13:; + __pyx_t_3 = __Pyx_PyString_Join(__pyx_kp_s__2, __pyx_v_attributes_whitelist); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyNumber_Add(__pyx_kp_s_, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_Add(__pyx_t_1, __pyx_kp_s__3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 183, __pyx_L1_error) + try { + __pyx_t_7 = new re2::RE2(__pyx_t_6); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 183, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_self->attributes_whitelist = __pyx_t_7; - /* "gumbocy.pyx":73 - * attr = node.v.element.attributes.data[i] - * attr_name = str(attr.name) - * if attr_name in self.attributes_whitelist: # <<<<<<<<<<<<<< - * if attr_name == b"class": - * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + /* "gumbocy.pyx":182 + * # FInally, freeze the attributes whitelist + * self.has_attributes_whitelist = len(attributes_whitelist) > 0 + * if self.has_attributes_whitelist: # <<<<<<<<<<<<<< + * self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$") + * */ - } - } + } - /* "gumbocy.pyx":94 - * attrs[attr_name] = attr.value - * - * if not has_attrs: # <<<<<<<<<<<<<< - * self.nodes.append((level, tag_name)) + /* "gumbocy.pyx":185 + * self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$") * + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) # <<<<<<<<<<<<<< + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) */ - __pyx_t_1 = ((!(__pyx_v_has_attrs != 0)) != 0); - if (__pyx_t_1) { + __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_BODY); - /* "gumbocy.pyx":95 + /* "gumbocy.pyx":186 * - * if not has_attrs: - * self.nodes.append((level, tag_name)) # <<<<<<<<<<<<<< + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) # <<<<<<<<<<<<<< + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) + */ + __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_P); + + /* "gumbocy.pyx":187 + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) # 
<<<<<<<<<<<<<< + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) * - * else: */ - if (unlikely(__pyx_v_self->nodes == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); - __PYX_ERR(0, 95, __pyx_L1_error) - } - __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 95, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 95, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 95, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __Pyx_GIVEREF(__pyx_t_2); - PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_2); - __Pyx_GIVEREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_4); - __pyx_t_2 = 0; - __pyx_t_4 = 0; - __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_11); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 95, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_DIV); - /* "gumbocy.pyx":94 - * attrs[attr_name] = attr.value + /* "gumbocy.pyx":188 + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) # <<<<<<<<<<<<<< * - * if not has_attrs: # <<<<<<<<<<<<<< - * self.nodes.append((level, tag_name)) + * for tag_name in options.get("tags_ignore", []): + */ + __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_SPAN); + + /* "gumbocy.pyx":190 + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) * + * for tag_name in options.get("tags_ignore", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - goto __pyx_L21; + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 190, 
__pyx_L1_error) + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 190, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_tags_ignore, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 190, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { + __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_5 = 0; + __pyx_t_10 = NULL; + } else { + __pyx_t_5 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 190, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 190, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + for (;;) { + if (likely(!__pyx_t_10)) { + if (likely(PyList_CheckExact(__pyx_t_3))) { + if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 190, __pyx_L1_error) + #else + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 190, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif + } else { + if (__pyx_t_5 >= PyTuple_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 190, __pyx_L1_error) + #else + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 190, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif } + } else { + __pyx_t_1 = __pyx_t_10(__pyx_t_3); + if (unlikely(!__pyx_t_1)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) 
PyErr_Clear(); + else __PYX_ERR(0, 190, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_1); + } + __Pyx_XDECREF_SET(__pyx_v_tag_name, __pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":99 - * else: + /* "gumbocy.pyx":191 * - * if self.has_ids_ignore: # <<<<<<<<<<<<<< - * if attrs.get("id") and attrs["id"].lower() in self.ids_ignore: - * return 0 + * for tag_name in options.get("tags_ignore", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) # <<<<<<<<<<<<<< + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) */ - /*else*/ { - __pyx_t_1 = (__pyx_v_self->has_ids_ignore != 0); - if (__pyx_t_1) { + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 191, __pyx_L1_error) + __pyx_v_tag = gumbo_tag_enum(__pyx_t_11); - /* "gumbocy.pyx":100 - * - * if self.has_ids_ignore: - * if attrs.get("id") and attrs["id"].lower() in self.ids_ignore: # <<<<<<<<<<<<<< - * return 0 + /* "gumbocy.pyx":192 + * for tag_name in options.get("tags_ignore", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) * */ - __pyx_t_11 = __Pyx_PyObject_GetAttrStr(__pyx_v_attrs, __pyx_n_s_get); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_11, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - if (__pyx_t_6) { - } else { - __pyx_t_1 = __pyx_t_6; - goto __pyx_L24_bool_binop_done; - } - __pyx_t_11 = PyObject_GetItem(__pyx_v_attrs, __pyx_n_s_id); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - 
__pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_11, __pyx_n_s_lower); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - __pyx_t_11 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { - __pyx_t_11 = PyMethod_GET_SELF(__pyx_t_2); - if (likely(__pyx_t_11)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); - __Pyx_INCREF(__pyx_t_11); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_2, function); - } - } - if (__pyx_t_11) { - __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_11); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - } else { - __pyx_t_4 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 100, __pyx_L1_error) - } - __Pyx_GOTREF(__pyx_t_4); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_t_4, __pyx_v_self->ids_ignore, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __pyx_t_16 = (__pyx_t_6 != 0); - __pyx_t_1 = __pyx_t_16; - __pyx_L24_bool_binop_done:; - if (__pyx_t_1) { + __pyx_t_2 = ((__pyx_v_tag != GUMBO_TAG_UNKNOWN) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":101 - * if self.has_ids_ignore: - * if attrs.get("id") and attrs["id"].lower() in self.ids_ignore: - * return 0 # <<<<<<<<<<<<<< + /* "gumbocy.pyx":193 + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) # <<<<<<<<<<<<<< * - * self.nodes.append((level, tag_name, attrs)) + * for tag_name in options.get("tags_boilerplate", []): */ - __pyx_r = 0; - goto __pyx_L0; + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_v_self->tags_ignore.insert(((int)gumbo_tag_enum(__pyx_t_11))); - /* "gumbocy.pyx":100 
- * - * if self.has_ids_ignore: - * if attrs.get("id") and attrs["id"].lower() in self.ids_ignore: # <<<<<<<<<<<<<< - * return 0 + /* "gumbocy.pyx":192 + * for tag_name in options.get("tags_ignore", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) * */ - } + } - /* "gumbocy.pyx":99 - * else: + /* "gumbocy.pyx":190 + * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) * - * if self.has_ids_ignore: # <<<<<<<<<<<<<< - * if attrs.get("id") and attrs["id"].lower() in self.ids_ignore: - * return 0 + * for tag_name in options.get("tags_ignore", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - } + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gumbocy.pyx":103 - * return 0 - * - * self.nodes.append((level, tag_name, attrs)) # <<<<<<<<<<<<<< + /* "gumbocy.pyx":195 + * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * else: + * for tag_name in options.get("tags_boilerplate", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - if (unlikely(__pyx_v_self->nodes == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); - __PYX_ERR(0, 103, __pyx_L1_error) + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 195, __pyx_L1_error) + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_tags_boilerplate, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { + __pyx_t_3 = __pyx_t_1; 
__Pyx_INCREF(__pyx_t_3); __pyx_t_5 = 0; + __pyx_t_10 = NULL; + } else { + __pyx_t_5 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 195, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + for (;;) { + if (likely(!__pyx_t_10)) { + if (likely(PyList_CheckExact(__pyx_t_3))) { + if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) + #else + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif + } else { + if (__pyx_t_5 >= PyTuple_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) + #else + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif + } + } else { + __pyx_t_1 = __pyx_t_10(__pyx_t_3); + if (unlikely(!__pyx_t_1)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 195, __pyx_L1_error) } - __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 103, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __pyx_t_2 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 103, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_11 = PyTuple_New(3); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 103, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - 
__Pyx_GIVEREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_4); - __Pyx_GIVEREF(__pyx_t_2); - PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_2); - __Pyx_INCREF(__pyx_v_attrs); - __Pyx_GIVEREF(__pyx_v_attrs); - PyTuple_SET_ITEM(__pyx_t_11, 2, __pyx_v_attrs); - __pyx_t_4 = 0; - __pyx_t_2 = 0; - __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_11); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 103, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + break; } - __pyx_L21:; + __Pyx_GOTREF(__pyx_t_1); + } + __Pyx_XDECREF_SET(__pyx_v_tag_name, __pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":65 - * tag_name = py_tag_name - * - * if self.has_attributes_whitelist: # <<<<<<<<<<<<<< + /* "gumbocy.pyx":196 * - * # Build a dict with all the whitelisted attributes + * for tag_name in options.get("tags_boilerplate", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) # <<<<<<<<<<<<<< + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_boilerplate.insert( gumbocy.gumbo_tag_enum(tag_name)) */ - goto __pyx_L9; - } + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 196, __pyx_L1_error) + __pyx_v_tag = gumbo_tag_enum(__pyx_t_11); - /* "gumbocy.pyx":106 + /* "gumbocy.pyx":197 + * for tag_name in options.get("tags_boilerplate", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_boilerplate.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * else: - * self.nodes.append((level, tag_name)) # <<<<<<<<<<<<<< + */ + __pyx_t_2 = ((__pyx_v_tag != GUMBO_TAG_UNKNOWN) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":198 + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_boilerplate.insert( gumbocy.gumbo_tag_enum(tag_name)) # <<<<<<<<<<<<<< + * + * for tag_name in options.get("tags_boilerplate_bypass", []): + */ + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && 
PyErr_Occurred())) __PYX_ERR(0, 198, __pyx_L1_error) + __pyx_v_self->tags_boilerplate.insert(((int)gumbo_tag_enum(__pyx_t_11))); + + /* "gumbocy.pyx":197 + * for tag_name in options.get("tags_boilerplate", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_boilerplate.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * # Call _iternode() recursively for each of the children */ - /*else*/ { - if (unlikely(__pyx_v_self->nodes == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); - __PYX_ERR(0, 106, __pyx_L1_error) - } - __pyx_t_11 = __Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 106, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __pyx_t_2 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 106, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 106, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __Pyx_GIVEREF(__pyx_t_11); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_11); - __Pyx_GIVEREF(__pyx_t_2); - PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_2); - __pyx_t_11 = 0; - __pyx_t_2 = 0; - __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_4); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 106, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } - __pyx_L9:; - /* "gumbocy.pyx":109 + /* "gumbocy.pyx":195 + * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * # Call _iternode() recursively for each of the children - * for i in range(node.v.element.children.length): # <<<<<<<<<<<<<< - * child = node.v.element.children.data[i] - * if self._traverse_node(level + 1, child) == 1: + * for tag_name in options.get("tags_boilerplate", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - __pyx_t_9 = __pyx_v_node->v.element.children.length; - for (__pyx_t_10 = 0; __pyx_t_10 
< __pyx_t_9; __pyx_t_10+=1) { - __pyx_v_i = __pyx_t_10; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gumbocy.pyx":110 - * # Call _iternode() recursively for each of the children - * for i in range(node.v.element.children.length): - * child = node.v.element.children.data[i] # <<<<<<<<<<<<<< - * if self._traverse_node(level + 1, child) == 1: - * break + /* "gumbocy.pyx":200 + * self.tags_boilerplate.insert( gumbocy.gumbo_tag_enum(tag_name)) + * + * for tag_name in options.get("tags_boilerplate_bypass", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - __pyx_v_child = ((GumboNode *)(__pyx_v_node->v.element.children.data[__pyx_v_i])); + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 200, __pyx_L1_error) + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 200, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_tags_boilerplate_bypass, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 200, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { + __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_5 = 0; + __pyx_t_10 = NULL; + } else { + __pyx_t_5 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 200, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 200, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + for (;;) { + if (likely(!__pyx_t_10)) { + if (likely(PyList_CheckExact(__pyx_t_3))) { + if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) 
__PYX_ERR(0, 200, __pyx_L1_error) + #else + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 200, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif + } else { + if (__pyx_t_5 >= PyTuple_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 200, __pyx_L1_error) + #else + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 200, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif + } + } else { + __pyx_t_1 = __pyx_t_10(__pyx_t_3); + if (unlikely(!__pyx_t_1)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 200, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_1); + } + __Pyx_XDECREF_SET(__pyx_v_tag_name, __pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":111 - * for i in range(node.v.element.children.length): - * child = node.v.element.children.data[i] - * if self._traverse_node(level + 1, child) == 1: # <<<<<<<<<<<<<< - * break + /* "gumbocy.pyx":201 * + * for tag_name in options.get("tags_boilerplate_bypass", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) # <<<<<<<<<<<<<< + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_boilerplate_bypass.insert( gumbocy.gumbo_tag_enum(tag_name)) */ - __pyx_t_1 = ((((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->_traverse_node(__pyx_v_self, (__pyx_v_level + 1), __pyx_v_child) == 1) != 0); - if (__pyx_t_1) { + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 201, __pyx_L1_error) + __pyx_v_tag = gumbo_tag_enum(__pyx_t_11); - /* "gumbocy.pyx":112 - * child = node.v.element.children.data[i] - * if self._traverse_node(level + 1, child) == 1: - 
* break # <<<<<<<<<<<<<< + /* "gumbocy.pyx":202 + * for tag_name in options.get("tags_boilerplate_bypass", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_boilerplate_bypass.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: */ - goto __pyx_L27_break; + __pyx_t_2 = ((__pyx_v_tag != GUMBO_TAG_UNKNOWN) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":111 - * for i in range(node.v.element.children.length): - * child = node.v.element.children.data[i] - * if self._traverse_node(level + 1, child) == 1: # <<<<<<<<<<<<<< - * break + /* "gumbocy.pyx":203 + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_boilerplate_bypass.insert( gumbocy.gumbo_tag_enum(tag_name)) # <<<<<<<<<<<<<< + * + * for tag_name in options.get("tags_separators", []): + */ + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 203, __pyx_L1_error) + __pyx_v_self->tags_boilerplate_bypass.insert(((int)gumbo_tag_enum(__pyx_t_11))); + + /* "gumbocy.pyx":202 + * for tag_name in options.get("tags_boilerplate_bypass", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_boilerplate_bypass.insert( gumbocy.gumbo_tag_enum(tag_name)) * */ - } } - __pyx_L27_break:; - /* "gumbocy.pyx":114 - * break + /* "gumbocy.pyx":200 + * self.tags_boilerplate.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: # <<<<<<<<<<<<<< - * return 1 + * for tag_name in options.get("tags_boilerplate_bypass", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + */ + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":205 + * self.tags_boilerplate_bypass.insert( gumbocy.gumbo_tag_enum(tag_name)) * + * for 
tag_name in options.get("tags_separators", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - __pyx_t_16 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_HEAD) != 0); - if (__pyx_t_16) { + if (unlikely(__pyx_v_options == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 205, __pyx_L1_error) + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 205, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_tags_separators, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 205, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { + __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_5 = 0; + __pyx_t_10 = NULL; + } else { + __pyx_t_5 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 205, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 205, __pyx_L1_error) + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + for (;;) { + if (likely(!__pyx_t_10)) { + if (likely(PyList_CheckExact(__pyx_t_3))) { + if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 205, __pyx_L1_error) + #else + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 205, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif + } else { + if (__pyx_t_5 >= PyTuple_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 205, __pyx_L1_error) + #else + __pyx_t_1 = 
PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 205, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + #endif + } } else { - __pyx_t_1 = __pyx_t_16; - goto __pyx_L30_bool_binop_done; + __pyx_t_1 = __pyx_t_10(__pyx_t_3); + if (unlikely(!__pyx_t_1)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 205, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_1); } - __pyx_t_16 = (__pyx_v_self->head_only != 0); - __pyx_t_1 = __pyx_t_16; - __pyx_L30_bool_binop_done:; - if (__pyx_t_1) { + __Pyx_XDECREF_SET(__pyx_v_tag_name, __pyx_t_1); + __pyx_t_1 = 0; - /* "gumbocy.pyx":115 + /* "gumbocy.pyx":206 * - * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: - * return 1 # <<<<<<<<<<<<<< + * for tag_name in options.get("tags_separators", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) # <<<<<<<<<<<<<< + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_separators.insert( gumbocy.gumbo_tag_enum(tag_name)) + */ + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 206, __pyx_L1_error) + __pyx_v_tag = gumbo_tag_enum(__pyx_t_11); + + /* "gumbocy.pyx":207 + * for tag_name in options.get("tags_separators", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_separators.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * return 0 */ - __pyx_r = 1; - goto __pyx_L0; + __pyx_t_2 = ((__pyx_v_tag != GUMBO_TAG_UNKNOWN) != 0); + if (__pyx_t_2) { - /* "gumbocy.pyx":114 - * break + /* "gumbocy.pyx":208 + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.tags_separators.insert( gumbocy.gumbo_tag_enum(tag_name)) # <<<<<<<<<<<<<< * - * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: # 
<<<<<<<<<<<<<< - * return 1 + * self.tags_separators.insert(gumbocy.GUMBO_TAG_BODY) + */ + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_v_self->tags_separators.insert(((int)gumbo_tag_enum(__pyx_t_11))); + + /* "gumbocy.pyx":207 + * for tag_name in options.get("tags_separators", []): + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< + * self.tags_separators.insert( gumbocy.gumbo_tag_enum(tag_name)) * */ } - /* "gumbocy.pyx":46 - * self.nodes.append((level, None, node.v.text.text)) - * - * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: # <<<<<<<<<<<<<< + /* "gumbocy.pyx":205 + * self.tags_boilerplate_bypass.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * tag_n = node.v.element.tag + * for tag_name in options.get("tags_separators", []): # <<<<<<<<<<<<<< + * tag = gumbocy.gumbo_tag_enum(tag_name) + * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - break; - default: break; } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gumbocy.pyx":117 - * return 1 + /* "gumbocy.pyx":210 + * self.tags_separators.insert( gumbocy.gumbo_tag_enum(tag_name)) * - * return 0 # <<<<<<<<<<<<<< + * self.tags_separators.insert(gumbocy.GUMBO_TAG_BODY) # <<<<<<<<<<<<<< * - * def parse(self): + * cdef bint guess_node_hidden(self, gumbocy.GumboNode* node, Attributes attrs): */ - __pyx_r = 0; - goto __pyx_L0; + __pyx_v_self->tags_separators.insert(GUMBO_TAG_BODY); - /* "gumbocy.pyx":35 - * self.html = html + /* "gumbocy.pyx":112 + * cdef list nodes * - * cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node): # <<<<<<<<<<<<<< - * """ Traverses the node tree. 
Return 1 to stop at this level """ + * def __cinit__(self, dict options=None): # <<<<<<<<<<<<<< * + * options = options or {} */ /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_3); - __Pyx_XDECREF(__pyx_t_4); - __Pyx_XDECREF(__pyx_t_11); - __Pyx_XDECREF(__pyx_t_12); - __Pyx_XDECREF(__pyx_t_13); - __Pyx_WriteUnraisable("gumbocy.HTMLParser._traverse_node", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); - __pyx_r = 0; + __Pyx_AddTraceback("gumbocy.HTMLParser.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; __pyx_L0:; - __Pyx_XDECREF(__pyx_v_py_tag_name); - __Pyx_XDECREF(__pyx_v_attrs); - __Pyx_XDECREF(__pyx_v_attr_name); - __Pyx_XDECREF(__pyx_v_multiple_value); - __Pyx_XDECREF(__pyx_v_v); + __Pyx_XDECREF(__pyx_v_attributes_whitelist); + __Pyx_XDECREF(__pyx_v_classes_ignore); + __Pyx_XDECREF(__pyx_v_ids_ignore); + __Pyx_XDECREF(__pyx_v_classes_hidden); + __Pyx_XDECREF(__pyx_v_ids_hidden); + __Pyx_XDECREF(__pyx_v_classes_boilerplate); + __Pyx_XDECREF(__pyx_v_ids_boilerplate); + __Pyx_XDECREF(__pyx_v_roles_boilerplate); + __Pyx_XDECREF(__pyx_v_metas_whitelist); + __Pyx_XDECREF(__pyx_v_tag_name); + __Pyx_XDECREF(__pyx_v_options); __Pyx_RefNannyFinishContext(); return __pyx_r; } -/* "gumbocy.pyx":119 - * return 0 +/* "gumbocy.pyx":212 + * self.tags_separators.insert(gumbocy.GUMBO_TAG_BODY) + * + * cdef bint guess_node_hidden(self, gumbocy.GumboNode* node, Attributes attrs): # <<<<<<<<<<<<<< + * """ Rough guess to check if the element is explicitly hidden. 
* - * def parse(self): # <<<<<<<<<<<<<< - * """ Do the actual parsing of the HTML with gumbo """ - * self.output = gumbocy.gumbo_parse(self.html) */ -/* Python wrapper */ -static PyObject *__pyx_pw_7gumbocy_10HTMLParser_3parse(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static char __pyx_doc_7gumbocy_10HTMLParser_2parse[] = " Do the actual parsing of the HTML with gumbo "; -static PyObject *__pyx_pw_7gumbocy_10HTMLParser_3parse(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { - PyObject *__pyx_r = 0; +static int __pyx_f_7gumbocy_10HTMLParser_guess_node_hidden(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, CYTHON_UNUSED GumboNode *__pyx_v_node, struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs) { + PyObject *__pyx_v_k = NULL; + int __pyx_r; __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("parse (wrapper)", 0); - __pyx_r = __pyx_pf_7gumbocy_10HTMLParser_2parse(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self)); + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + char const *__pyx_t_5; + Py_ssize_t __pyx_t_6; + __Pyx_RefNannySetupContext("guess_node_hidden", 0); - /* function exit code */ - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} + /* "gumbocy.pyx":218 + * """ + * + * if not self.has_attributes_whitelist: # <<<<<<<<<<<<<< + * return False + * + */ + __pyx_t_1 = ((!(__pyx_v_self->has_attributes_whitelist != 0)) != 0); + if (__pyx_t_1) { -static PyObject *__pyx_pf_7gumbocy_10HTMLParser_2parse(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { - PyObject *__pyx_r = NULL; - __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("parse", 0); + /* "gumbocy.pyx":219 + * + * if not self.has_attributes_whitelist: + * return False # <<<<<<<<<<<<<< + * + * # From the HTML5 spec + */ + __pyx_r = 0; + goto __pyx_L0; - /* "gumbocy.pyx":121 - * def parse(self): - * """ Do the actual parsing of the HTML with gumbo """ - * self.output = gumbocy.gumbo_parse(self.html) # 
<<<<<<<<<<<<<< + /* "gumbocy.pyx":218 + * """ + * + * if not self.has_attributes_whitelist: # <<<<<<<<<<<<<< + * return False * - * def listnodes(self, dict options=None): */ - __pyx_v_self->output = gumbo_parse(__pyx_v_self->html); + } - /* "gumbocy.pyx":119 - * return 0 + /* "gumbocy.pyx":222 + * + * # From the HTML5 spec + * if attrs.has_hidden: # <<<<<<<<<<<<<< + * return True * - * def parse(self): # <<<<<<<<<<<<<< - * """ Do the actual parsing of the HTML with gumbo """ - * self.output = gumbocy.gumbo_parse(self.html) */ + __pyx_t_1 = (__pyx_v_attrs->has_hidden != 0); + if (__pyx_t_1) { - /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - __Pyx_XGIVEREF(__pyx_r); - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} + /* "gumbocy.pyx":223 + * # From the HTML5 spec + * if attrs.has_hidden: + * return True # <<<<<<<<<<<<<< + * + * if self.has_ids_hidden and attrs.values.get(ATTR_ID): + */ + __pyx_r = 1; + goto __pyx_L0; -/* "gumbocy.pyx":123 - * self.output = gumbocy.gumbo_parse(self.html) + /* "gumbocy.pyx":222 * - * def listnodes(self, dict options=None): # <<<<<<<<<<<<<< - * """ Return the nodes as a flat list of tuples """ + * # From the HTML5 spec + * if attrs.has_hidden: # <<<<<<<<<<<<<< + * return True * */ + } -/* Python wrapper */ -static PyObject *__pyx_pw_7gumbocy_10HTMLParser_5listnodes(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static char __pyx_doc_7gumbocy_10HTMLParser_4listnodes[] = " Return the nodes as a flat list of tuples "; -static PyObject *__pyx_pw_7gumbocy_10HTMLParser_5listnodes(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - PyObject *__pyx_v_options = 0; - PyObject *__pyx_r = 0; - __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("listnodes (wrapper)", 0); - { - static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_options,0}; - PyObject* values[1] = {0}; - values[0] = ((PyObject*)Py_None); - if (unlikely(__pyx_kwds)) { - Py_ssize_t kw_args; - const 
Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); - switch (pos_args) { - case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); - case 0: break; - default: goto __pyx_L5_argtuple_error; - } - kw_args = PyDict_Size(__pyx_kwds); - switch (pos_args) { - case 0: - if (kw_args > 0) { - PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_options); - if (value) { values[0] = value; kw_args--; } - } - } - if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "listnodes") < 0)) __PYX_ERR(0, 123, __pyx_L3_error) - } - } else { - switch (PyTuple_GET_SIZE(__pyx_args)) { - case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); - case 0: break; - default: goto __pyx_L5_argtuple_error; + /* "gumbocy.pyx":225 + * return True + * + * if self.has_ids_hidden and attrs.values.get(ATTR_ID): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)): + * return True + */ + __pyx_t_2 = (__pyx_v_self->has_ids_hidden != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L6_bool_binop_done; + } + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 225, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ID); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 225, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 225, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 225, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_1 = __pyx_t_2; + __pyx_L6_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":226 + * + * if self.has_ids_hidden and attrs.values.get(ATTR_ID): + * if 
re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)): # <<<<<<<<<<<<<< + * return True + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 226, __pyx_L1_error) + } + __pyx_t_4 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ID); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_5) && PyErr_Occurred())) __PYX_ERR(0, 226, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_5, (*__pyx_v_self->ids_hidden)) != 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_1) { + + /* "gumbocy.pyx":227 + * if self.has_ids_hidden and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)): + * return True # <<<<<<<<<<<<<< + * + * if self.has_classes_hidden and attrs.size_classes > 0: + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":226 + * + * if self.has_ids_hidden and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)): # <<<<<<<<<<<<<< + * return True + * + */ + } + + /* "gumbocy.pyx":225 + * return True + * + * if self.has_ids_hidden and attrs.values.get(ATTR_ID): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)): + * return True + */ + } + + /* "gumbocy.pyx":229 + * return True + * + * if self.has_classes_hidden and attrs.size_classes > 0: # <<<<<<<<<<<<<< + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_hidden)): + */ + __pyx_t_2 = (__pyx_v_self->has_classes_hidden != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L10_bool_binop_done; + } + __pyx_t_2 = 
((__pyx_v_attrs->size_classes > 0) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L10_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":230 + * + * if self.has_classes_hidden and attrs.size_classes > 0: + * for k in attrs.classes: # <<<<<<<<<<<<<< + * if re2_search(k, deref(self.classes_hidden)): + * return True + */ + if (unlikely(__pyx_v_attrs->classes == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(0, 230, __pyx_L1_error) + } + __pyx_t_3 = __pyx_v_attrs->classes; __Pyx_INCREF(__pyx_t_3); __pyx_t_6 = 0; + for (;;) { + if (__pyx_t_6 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_4 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_6); __Pyx_INCREF(__pyx_t_4); __pyx_t_6++; if (unlikely(0 < 0)) __PYX_ERR(0, 230, __pyx_L1_error) + #else + __pyx_t_4 = PySequence_ITEM(__pyx_t_3, __pyx_t_6); __pyx_t_6++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 230, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + #endif + __Pyx_XDECREF_SET(__pyx_v_k, __pyx_t_4); + __pyx_t_4 = 0; + + /* "gumbocy.pyx":231 + * if self.has_classes_hidden and attrs.size_classes > 0: + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_hidden)): # <<<<<<<<<<<<<< + * return True + * + */ + __pyx_t_5 = __Pyx_PyObject_AsString(__pyx_v_k); if (unlikely((!__pyx_t_5) && PyErr_Occurred())) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_5, (*__pyx_v_self->classes_hidden)) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":232 + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_hidden)): + * return True # <<<<<<<<<<<<<< + * + * if attrs.values.get(ATTR_STYLE): + */ + __pyx_r = 1; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":231 + * if self.has_classes_hidden and attrs.size_classes > 0: + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_hidden)): # <<<<<<<<<<<<<< + * return True + * + */ } + + /* "gumbocy.pyx":230 + * + * if 
self.has_classes_hidden and attrs.size_classes > 0: + * for k in attrs.classes: # <<<<<<<<<<<<<< + * if re2_search(k, deref(self.classes_hidden)): + * return True + */ } - __pyx_v_options = ((PyObject*)values[0]); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":229 + * return True + * + * if self.has_classes_hidden and attrs.size_classes > 0: # <<<<<<<<<<<<<< + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_hidden)): + */ } - goto __pyx_L4_argument_unpacking_done; - __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("listnodes", 0, 0, 1, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 123, __pyx_L3_error) - __pyx_L3_error:; - __Pyx_AddTraceback("gumbocy.HTMLParser.listnodes", __pyx_clineno, __pyx_lineno, __pyx_filename); - __Pyx_RefNannyFinishContext(); - return NULL; - __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_options), (&PyDict_Type), 1, "options", 1))) __PYX_ERR(0, 123, __pyx_L1_error) - __pyx_r = __pyx_pf_7gumbocy_10HTMLParser_4listnodes(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self), __pyx_v_options); - /* function exit code */ + /* "gumbocy.pyx":234 + * return True + * + * if attrs.values.get(ATTR_STYLE): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)): + * return True + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 234, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_STYLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 
234, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_1) { + + /* "gumbocy.pyx":235 + * + * if attrs.values.get(ATTR_STYLE): + * if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)): # <<<<<<<<<<<<<< + * return True + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 235, __pyx_L1_error) + } + __pyx_t_4 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_STYLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 235, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 235, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_5) && PyErr_Occurred())) __PYX_ERR(0, 235, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_5, (*__pyx_v_7gumbocy__RE2_SEARCH_STYLE_HIDDEN)) != 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_1) { + + /* "gumbocy.pyx":236 + * if attrs.values.get(ATTR_STYLE): + * if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)): + * return True # <<<<<<<<<<<<<< + * + * return False + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":235 + * + * if attrs.values.get(ATTR_STYLE): + * if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)): # <<<<<<<<<<<<<< + * return True + * + */ + } + + /* "gumbocy.pyx":234 + * return True + * + * if attrs.values.get(ATTR_STYLE): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)): + * return True + */ + } + + /* "gumbocy.pyx":238 + * return True + * + * return False # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = 0; goto __pyx_L0; + + /* "gumbocy.pyx":212 + * self.tags_separators.insert(gumbocy.GUMBO_TAG_BODY) + * + * cdef bint guess_node_hidden(self, 
gumbocy.GumboNode* node, Attributes attrs): # <<<<<<<<<<<<<< + * """ Rough guess to check if the element is explicitly hidden. + * + */ + + /* function exit code */ __pyx_L1_error:; - __pyx_r = NULL; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.guess_node_hidden", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_r = 0; __pyx_L0:; + __Pyx_XDECREF(__pyx_v_k); __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_7gumbocy_10HTMLParser_4listnodes(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_options) { - PyObject *__pyx_v_tag_name = NULL; - GumboTag __pyx_v_tag; - PyObject *__pyx_r = NULL; +/* "gumbocy.pyx":241 + * + * + * cdef bint guess_node_boilerplate(self, gumbocy.GumboNode* node, Attributes attrs): # <<<<<<<<<<<<<< + * """ Rough guess to check if the element is boilerplate """ + * + */ + +static int __pyx_f_7gumbocy_10HTMLParser_guess_node_boilerplate(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, GumboNode *__pyx_v_node, struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs) { + PyObject *__pyx_v_k = NULL; + int __pyx_r; __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; + int __pyx_t_1; int __pyx_t_2; PyObject *__pyx_t_3 = NULL; - int __pyx_t_4; - Py_ssize_t __pyx_t_5; - PyObject *(*__pyx_t_6)(PyObject *); - char const *__pyx_t_7; - __Pyx_RefNannySetupContext("listnodes", 0); - __Pyx_INCREF(__pyx_v_options); + Py_ssize_t __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + char const *__pyx_t_6; + __Pyx_RefNannySetupContext("guess_node_boilerplate", 0); - /* "gumbocy.pyx":126 - * """ Return the nodes as a flat list of tuples """ + /* "gumbocy.pyx":244 + * """ Rough guess to check if the element is boilerplate """ + * + * if self.tags_boilerplate.count( node.v.element.tag): # <<<<<<<<<<<<<< + * return True * - * options = options or {} # <<<<<<<<<<<<<< - * self.nesting_limit = options.get("nesting_limit", 999) - * self.head_only = 
options.get("head_only") */ - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_options); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 126, __pyx_L1_error) - if (!__pyx_t_2) { + __pyx_t_1 = (__pyx_v_self->tags_boilerplate.count(((int)__pyx_v_node->v.element.tag)) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":245 + * + * if self.tags_boilerplate.count( node.v.element.tag): + * return True # <<<<<<<<<<<<<< + * + * # http://html5doctor.com/understanding-aside/ + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":244 + * """ Rough guess to check if the element is boilerplate """ + * + * if self.tags_boilerplate.count( node.v.element.tag): # <<<<<<<<<<<<<< + * return True + * + */ + } + + /* "gumbocy.pyx":248 + * + * # http://html5doctor.com/understanding-aside/ + * if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE: # <<<<<<<<<<<<<< + * if "article" not in self.current_stack: + * return True + */ + __pyx_t_1 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_ASIDE) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":249 + * # http://html5doctor.com/understanding-aside/ + * if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE: + * if "article" not in self.current_stack: # <<<<<<<<<<<<<< + * return True + * + */ + __pyx_t_1 = (__Pyx_PySequence_ContainsTF(__pyx_n_s_article, __pyx_v_self->current_stack, Py_NE)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 249, __pyx_L1_error) + __pyx_t_2 = (__pyx_t_1 != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":250 + * if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE: + * if "article" not in self.current_stack: + * return True # <<<<<<<<<<<<<< + * + * if self.has_classes_boilerplate and attrs.size_classes > 0: + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":249 + * # http://html5doctor.com/understanding-aside/ + * if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE: + * if "article" not in self.current_stack: # <<<<<<<<<<<<<< + * return True + * + */ + } + + /* "gumbocy.pyx":248 + * + * # http://html5doctor.com/understanding-aside/ + * if 
node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE: # <<<<<<<<<<<<<< + * if "article" not in self.current_stack: + * return True + */ + } + + /* "gumbocy.pyx":252 + * return True + * + * if self.has_classes_boilerplate and attrs.size_classes > 0: # <<<<<<<<<<<<<< + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_boilerplate)): + */ + __pyx_t_1 = (__pyx_v_self->has_classes_boilerplate != 0); + if (__pyx_t_1) { } else { - __Pyx_INCREF(__pyx_v_options); - __pyx_t_1 = __pyx_v_options; - goto __pyx_L3_bool_binop_done; + __pyx_t_2 = __pyx_t_1; + goto __pyx_L7_bool_binop_done; } - __pyx_t_3 = PyDict_New(); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 126, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_INCREF(__pyx_t_3); - __pyx_t_1 = __pyx_t_3; - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_L3_bool_binop_done:; - __Pyx_DECREF_SET(__pyx_v_options, ((PyObject*)__pyx_t_1)); - __pyx_t_1 = 0; + __pyx_t_1 = ((__pyx_v_attrs->size_classes > 0) != 0); + __pyx_t_2 = __pyx_t_1; + __pyx_L7_bool_binop_done:; + if (__pyx_t_2) { - /* "gumbocy.pyx":127 + /* "gumbocy.pyx":253 * - * options = options or {} - * self.nesting_limit = options.get("nesting_limit", 999) # <<<<<<<<<<<<<< - * self.head_only = options.get("head_only") + * if self.has_classes_boilerplate and attrs.size_classes > 0: + * for k in attrs.classes: # <<<<<<<<<<<<<< + * if re2_search(k, deref(self.classes_boilerplate)): + * return True + */ + if (unlikely(__pyx_v_attrs->classes == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(0, 253, __pyx_L1_error) + } + __pyx_t_3 = __pyx_v_attrs->classes; __Pyx_INCREF(__pyx_t_3); __pyx_t_4 = 0; + for (;;) { + if (__pyx_t_4 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_4); __Pyx_INCREF(__pyx_t_5); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 253, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_4); __pyx_t_4++; if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 253, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + __Pyx_XDECREF_SET(__pyx_v_k, __pyx_t_5); + __pyx_t_5 = 0; + + /* "gumbocy.pyx":254 + * if self.has_classes_boilerplate and attrs.size_classes > 0: + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_boilerplate)): # <<<<<<<<<<<<<< + * return True * */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); - __PYX_ERR(0, 127, __pyx_L1_error) + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_v_k); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 254, __pyx_L1_error) + __pyx_t_2 = (__pyx_f_7gumbocy_re2_search(__pyx_t_6, (*__pyx_v_self->classes_boilerplate)) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":255 + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_boilerplate)): + * return True # <<<<<<<<<<<<<< + * + * if self.has_ids_boilerplate and attrs.values.get(ATTR_ID): + */ + __pyx_r = 1; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":254 + * if self.has_classes_boilerplate and attrs.size_classes > 0: + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_boilerplate)): # <<<<<<<<<<<<<< + * return True + * + */ + } + + /* "gumbocy.pyx":253 + * + * if self.has_classes_boilerplate and attrs.size_classes > 0: + * for k in attrs.classes: # <<<<<<<<<<<<<< + * if re2_search(k, deref(self.classes_boilerplate)): + * return True + */ + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":252 + * return True + * + * if self.has_classes_boilerplate and attrs.size_classes > 0: # <<<<<<<<<<<<<< + * for k in attrs.classes: + * if re2_search(k, deref(self.classes_boilerplate)): + */ } - __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_nesting_limit, __pyx_int_999); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 127, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if 
(unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 127, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_v_self->nesting_limit = __pyx_t_4; - /* "gumbocy.pyx":128 - * options = options or {} - * self.nesting_limit = options.get("nesting_limit", 999) - * self.head_only = options.get("head_only") # <<<<<<<<<<<<<< + /* "gumbocy.pyx":257 + * return True * - * self.has_classes_ignore = options.get("classes_ignore") + * if self.has_ids_boilerplate and attrs.values.get(ATTR_ID): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)): + * return True + */ + __pyx_t_1 = (__pyx_v_self->has_ids_boilerplate != 0); + if (__pyx_t_1) { + } else { + __pyx_t_2 = __pyx_t_1; + goto __pyx_L13_bool_binop_done; + } + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 257, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ID); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_2 = __pyx_t_1; + __pyx_L13_bool_binop_done:; + if (__pyx_t_2) { + + /* "gumbocy.pyx":258 + * + * if self.has_ids_boilerplate and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)): # <<<<<<<<<<<<<< + * return True + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 258, __pyx_L1_error) + } + __pyx_t_5 = 
__Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ID); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 258, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 258, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 258, __pyx_L1_error) + __pyx_t_2 = (__pyx_f_7gumbocy_re2_search(__pyx_t_6, (*__pyx_v_self->ids_boilerplate)) != 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":259 + * if self.has_ids_boilerplate and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)): + * return True # <<<<<<<<<<<<<< + * + * if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE): + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":258 + * + * if self.has_ids_boilerplate and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)): # <<<<<<<<<<<<<< + * return True + * + */ + } + + /* "gumbocy.pyx":257 + * return True + * + * if self.has_ids_boilerplate and attrs.values.get(ATTR_ID): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)): + * return True + */ + } + + /* "gumbocy.pyx":261 + * return True + * + * if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)): + * return True + */ + __pyx_t_1 = (__pyx_v_self->has_roles_boilerplate != 0); + if (__pyx_t_1) { + } else { + __pyx_t_2 = __pyx_t_1; + goto __pyx_L17_bool_binop_done; + } + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 261, __pyx_L1_error) + } + __pyx_t_3 = 
__Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ROLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 261, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 261, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 261, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_2 = __pyx_t_1; + __pyx_L17_bool_binop_done:; + if (__pyx_t_2) { + + /* "gumbocy.pyx":262 + * + * if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE): + * if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)): # <<<<<<<<<<<<<< + * return True + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 262, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ROLE); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 262, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 262, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 262, __pyx_L1_error) + __pyx_t_2 = (__pyx_f_7gumbocy_re2_search(__pyx_t_6, (*__pyx_v_self->roles_boilerplate)) != 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":263 + * if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE): + * if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)): + * return True # <<<<<<<<<<<<<< + * + * return False + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":262 + * + * if self.has_roles_boilerplate and 
attrs.values.get(ATTR_ROLE): + * if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)): # <<<<<<<<<<<<<< + * return True + * + */ + } + + /* "gumbocy.pyx":261 + * return True + * + * if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)): + * return True + */ + } + + /* "gumbocy.pyx":265 + * return True + * + * return False # <<<<<<<<<<<<<< + * + * cdef Attributes get_attributes(self, gumbocy.GumboNode* node): + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":241 + * + * + * cdef bint guess_node_boilerplate(self, gumbocy.GumboNode* node, Attributes attrs): # <<<<<<<<<<<<<< + * """ Rough guess to check if the element is boilerplate """ + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.guess_node_boilerplate", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_k); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "gumbocy.pyx":267 + * return False + * + * cdef Attributes get_attributes(self, gumbocy.GumboNode* node): # <<<<<<<<<<<<<< + * """ Build a dict with all the whitelisted attributes """ + * + */ + +static struct __pyx_obj_7gumbocy_Attributes *__pyx_f_7gumbocy_10HTMLParser_get_attributes(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, GumboNode *__pyx_v_node) { + struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs = NULL; + unsigned int __pyx_v_i; + GumboAttribute *__pyx_v_attr; + PyObject *__pyx_v_multiple_value = NULL; + PyObject *__pyx_v_pystr = NULL; + struct __pyx_obj_7gumbocy_Attributes *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + unsigned int __pyx_t_2; + unsigned int __pyx_t_3; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + PyObject 
*__pyx_t_9 = NULL; + Py_ssize_t __pyx_t_10; + int __pyx_t_11; + __Pyx_RefNannySetupContext("get_attributes", 0); + + /* "gumbocy.pyx":270 + * """ Build a dict with all the whitelisted attributes """ + * + * attrs = Attributes() # <<<<<<<<<<<<<< + * # cdef Attributes attrs + * attrs.size_classes = 0 + */ + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_7gumbocy_Attributes), __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_attrs = ((struct __pyx_obj_7gumbocy_Attributes *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "gumbocy.pyx":272 + * attrs = Attributes() + * # cdef Attributes attrs + * attrs.size_classes = 0 # <<<<<<<<<<<<<< + * attrs.has_hidden = 0 + * # attrs.values = [""] * 10 + */ + __pyx_v_attrs->size_classes = 0; + + /* "gumbocy.pyx":273 + * # cdef Attributes attrs + * attrs.size_classes = 0 + * attrs.has_hidden = 0 # <<<<<<<<<<<<<< + * # attrs.values = [""] * 10 + * # attrs.classes = [] + */ + __pyx_v_attrs->has_hidden = 0; + + /* "gumbocy.pyx":276 + * # attrs.values = [""] * 10 + * # attrs.classes = [] + * attrs.values = {} # deref(new map[AttributeNames, const char*]()) # <<<<<<<<<<<<<< + * # attrs.values[ATTR_ID] = "x" + * # print dict(attrs.values) + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 276, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); + __Pyx_GOTREF(__pyx_v_attrs->values); + __Pyx_DECREF(__pyx_v_attrs->values); + __pyx_v_attrs->values = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "gumbocy.pyx":280 + * # print dict(attrs.values) + * + * for i in range(node.v.element.attributes.length): # <<<<<<<<<<<<<< + * + * attr = node.v.element.attributes.data[i] + */ + __pyx_t_2 = __pyx_v_node->v.element.attributes.length; + for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { + __pyx_v_i = __pyx_t_3; + + /* "gumbocy.pyx":282 + * for i in range(node.v.element.attributes.length): + * + * attr = 
node.v.element.attributes.data[i] # <<<<<<<<<<<<<< + * + * if re2_search(attr.name, deref(self.attributes_whitelist)): + */ + __pyx_v_attr = ((GumboAttribute *)(__pyx_v_node->v.element.attributes.data[__pyx_v_i])); + + /* "gumbocy.pyx":284 + * attr = node.v.element.attributes.data[i] + * + * if re2_search(attr.name, deref(self.attributes_whitelist)): # <<<<<<<<<<<<<< + * + * if attr.name == b"class": + */ + __pyx_t_4 = (__pyx_f_7gumbocy_re2_search(__pyx_v_attr->name, (*__pyx_v_self->attributes_whitelist)) != 0); + if (__pyx_t_4) { + + /* "gumbocy.pyx":286 + * if re2_search(attr.name, deref(self.attributes_whitelist)): + * + * if attr.name == b"class": # <<<<<<<<<<<<<< + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * attrs.size_classes = len(multiple_value) + */ + __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_1, __pyx_n_b_class, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":287 + * + * if attr.name == b"class": + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) # <<<<<<<<<<<<<< + * attrs.size_classes = len(multiple_value) + * if attrs.size_classes > 0: + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_RE_SPLIT_WHITESPACE); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_split); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_8 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __pyx_t_9 = __Pyx_PyObject_GetAttrStr(__pyx_t_8, __pyx_n_s_strip); if 
(unlikely(!__pyx_t_9)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_8 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_9))) { + __pyx_t_8 = PyMethod_GET_SELF(__pyx_t_9); + if (likely(__pyx_t_8)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_9); + __Pyx_INCREF(__pyx_t_8); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_9, function); + } + } + if (__pyx_t_8) { + __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_t_9, __pyx_t_8); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + } else { + __pyx_t_7 = __Pyx_PyObject_CallNoArg(__pyx_t_9); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 287, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + __pyx_t_9 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_lower); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_9))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_9); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_9); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_9, function); + } + } + if (__pyx_t_7) { + __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_9, __pyx_t_7); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } else { + __pyx_t_5 = __Pyx_PyObject_CallNoArg(__pyx_t_9); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 287, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + __pyx_t_9 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && unlikely(PyMethod_Check(__pyx_t_6))) { + __pyx_t_9 = PyMethod_GET_SELF(__pyx_t_6); + if (likely(__pyx_t_9)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6); + __Pyx_INCREF(__pyx_t_9); + 
__Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_6, function); + } + } + if (!__pyx_t_9) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else { + __pyx_t_7 = PyTuple_New(1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_9); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_9); __pyx_t_9 = NULL; + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_7, 0+1, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_6, __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyFrozenSet_New(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF_SET(__pyx_v_multiple_value, ((PyObject*)__pyx_t_6)); + __pyx_t_6 = 0; + + /* "gumbocy.pyx":288 + * if attr.name == b"class": + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * attrs.size_classes = len(multiple_value) # <<<<<<<<<<<<<< + * if attrs.size_classes > 0: + * attrs.classes = list(multiple_value) + */ + __pyx_t_10 = PySet_GET_SIZE(__pyx_v_multiple_value); if (unlikely(__pyx_t_10 == -1)) __PYX_ERR(0, 288, __pyx_L1_error) + __pyx_v_attrs->size_classes = __pyx_t_10; + + /* "gumbocy.pyx":289 + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * attrs.size_classes = len(multiple_value) + * if attrs.size_classes > 0: # <<<<<<<<<<<<<< + * attrs.classes = list(multiple_value) + * # for k in multiple_value: + */ + __pyx_t_4 = ((__pyx_v_attrs->size_classes > 0) != 0); + if (__pyx_t_4) { + + /* "gumbocy.pyx":290 + * attrs.size_classes = len(multiple_value) + * if attrs.size_classes > 
0: + * attrs.classes = list(multiple_value) # <<<<<<<<<<<<<< + * # for k in multiple_value: + * # ck = k + */ + __pyx_t_6 = PySequence_List(__pyx_v_multiple_value); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 290, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GIVEREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_v_attrs->classes); + __Pyx_DECREF(__pyx_v_attrs->classes); + __pyx_v_attrs->classes = ((PyObject*)__pyx_t_6); + __pyx_t_6 = 0; + + /* "gumbocy.pyx":289 + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * attrs.size_classes = len(multiple_value) + * if attrs.size_classes > 0: # <<<<<<<<<<<<<< + * attrs.classes = list(multiple_value) + * # for k in multiple_value: + */ + } + + /* "gumbocy.pyx":286 + * if re2_search(attr.name, deref(self.attributes_whitelist)): + * + * if attr.name == b"class": # <<<<<<<<<<<<<< + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * attrs.size_classes = len(multiple_value) + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":295 + * # attrs.classes.push_back(ck) # = list(multiple_value) + * + * elif attr.name == b"id": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_ID] = pystr + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 295, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_id, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 295, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":296 + * + * elif attr.name == b"id": + * pystr = str(attr.value).lower() # <<<<<<<<<<<<<< + * attrs.values[ATTR_ID] = pystr + * + */ + __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_7 = PyTuple_New(1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + 
__Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_lower); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_1)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_1); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (__pyx_t_1) { + __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } else { + __pyx_t_6 = __Pyx_PyObject_CallNoArg(__pyx_t_7); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 296, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_XDECREF_SET(__pyx_v_pystr, __pyx_t_6); + __pyx_t_6 = 0; + + /* "gumbocy.pyx":297 + * elif attr.name == b"id": + * pystr = str(attr.value).lower() + * attrs.values[ATTR_ID] = pystr # <<<<<<<<<<<<<< + * + * elif attr.name == b"style": + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 297, __pyx_L1_error) + } + __pyx_t_6 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ID); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 297, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_6, __pyx_v_pystr) < 0)) __PYX_ERR(0, 297, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":295 + * # 
attrs.classes.push_back(ck) # = list(multiple_value) + * + * elif attr.name == b"id": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_ID] = pystr + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":299 + * attrs.values[ATTR_ID] = pystr + * + * elif attr.name == b"style": # <<<<<<<<<<<<<< + * attrs.values[ATTR_STYLE] = attr.value + * + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 299, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_style, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 299, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":300 + * + * elif attr.name == b"style": + * attrs.values[ATTR_STYLE] = attr.value # <<<<<<<<<<<<<< + * + * elif attr.name == b"href": + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 300, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 300, __pyx_L1_error) + } + __pyx_t_7 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_STYLE); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 300, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_7, __pyx_t_6) < 0)) __PYX_ERR(0, 300, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":299 + * attrs.values[ATTR_ID] = pystr + * + * elif attr.name == b"style": # <<<<<<<<<<<<<< + * attrs.values[ATTR_STYLE] = attr.value + * + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":302 + * attrs.values[ATTR_STYLE] = attr.value + * + * elif attr.name == b"href": # <<<<<<<<<<<<<< + * attrs.values[ATTR_HREF] = attr.value + * + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if 
(unlikely(!__pyx_t_6)) __PYX_ERR(0, 302, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_href, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 302, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":303 + * + * elif attr.name == b"href": + * attrs.values[ATTR_HREF] = attr.value # <<<<<<<<<<<<<< + * + * elif attr.name == b"role": + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 303, __pyx_L1_error) + } + __pyx_t_7 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_7, __pyx_t_6) < 0)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":302 + * attrs.values[ATTR_STYLE] = attr.value + * + * elif attr.name == b"href": # <<<<<<<<<<<<<< + * attrs.values[ATTR_HREF] = attr.value + * + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":305 + * attrs.values[ATTR_HREF] = attr.value + * + * elif attr.name == b"role": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_ROLE] = pystr + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_role, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":306 + * + * elif attr.name == b"role": + * pystr = str(attr.value).lower() # <<<<<<<<<<<<<< + * 
attrs.values[ATTR_ROLE] = pystr + * + */ + __pyx_t_7 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 306, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 306, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_7); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_7); + __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_1, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 306, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_lower); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 306, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + } + } + if (__pyx_t_7) { + __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_7); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 306, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } else { + __pyx_t_6 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 306, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF_SET(__pyx_v_pystr, __pyx_t_6); + __pyx_t_6 = 0; + + /* "gumbocy.pyx":307 + * elif attr.name == b"role": + * pystr = str(attr.value).lower() + * attrs.values[ATTR_ROLE] = pystr # <<<<<<<<<<<<<< + * + * elif attr.name == b"rel": + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 307, __pyx_L1_error) + } + __pyx_t_6 = 
__Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ROLE); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 307, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_6, __pyx_v_pystr) < 0)) __PYX_ERR(0, 307, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":305 + * attrs.values[ATTR_HREF] = attr.value + * + * elif attr.name == b"role": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_ROLE] = pystr + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":309 + * attrs.values[ATTR_ROLE] = pystr + * + * elif attr.name == b"rel": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_REL] = pystr + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 309, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_rel, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 309, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":310 + * + * elif attr.name == b"rel": + * pystr = str(attr.value).lower() # <<<<<<<<<<<<<< + * attrs.values[ATTR_REL] = pystr + * + */ + __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 310, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_7 = PyTuple_New(1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 310, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 310, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_lower); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 310, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + 
__pyx_t_1 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_1)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_1); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (__pyx_t_1) { + __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 310, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } else { + __pyx_t_6 = __Pyx_PyObject_CallNoArg(__pyx_t_7); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 310, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_XDECREF_SET(__pyx_v_pystr, __pyx_t_6); + __pyx_t_6 = 0; + + /* "gumbocy.pyx":311 + * elif attr.name == b"rel": + * pystr = str(attr.value).lower() + * attrs.values[ATTR_REL] = pystr # <<<<<<<<<<<<<< + * + * elif attr.name == b"aria-hidden" and attr.value == b"true": + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 311, __pyx_L1_error) + } + __pyx_t_6 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_REL); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 311, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_6, __pyx_v_pystr) < 0)) __PYX_ERR(0, 311, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":309 + * attrs.values[ATTR_ROLE] = pystr + * + * elif attr.name == b"rel": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_REL] = pystr + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":313 + * attrs.values[ATTR_REL] = pystr + * + * elif attr.name == b"aria-hidden" and attr.value == b"true": # <<<<<<<<<<<<<< + * attrs.has_hidden = 1 + * + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 313, __pyx_L1_error) + 
__Pyx_GOTREF(__pyx_t_6); + __pyx_t_11 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_kp_b_aria_hidden, Py_EQ)); if (unlikely(__pyx_t_11 < 0)) __PYX_ERR(0, 313, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_11) { + } else { + __pyx_t_4 = __pyx_t_11; + goto __pyx_L8_bool_binop_done; + } + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 313, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_11 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_true, Py_EQ)); if (unlikely(__pyx_t_11 < 0)) __PYX_ERR(0, 313, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_4 = __pyx_t_11; + __pyx_L8_bool_binop_done:; + if (__pyx_t_4) { + + /* "gumbocy.pyx":314 + * + * elif attr.name == b"aria-hidden" and attr.value == b"true": + * attrs.has_hidden = 1 # <<<<<<<<<<<<<< + * + * elif attr.name == b"hidden": + */ + __pyx_v_attrs->has_hidden = 1; + + /* "gumbocy.pyx":313 + * attrs.values[ATTR_REL] = pystr + * + * elif attr.name == b"aria-hidden" and attr.value == b"true": # <<<<<<<<<<<<<< + * attrs.has_hidden = 1 + * + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":316 + * attrs.has_hidden = 1 + * + * elif attr.name == b"hidden": # <<<<<<<<<<<<<< + * attrs.has_hidden = 1 + * + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 316, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_hidden, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 316, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":317 + * + * elif attr.name == b"hidden": + * attrs.has_hidden = 1 # <<<<<<<<<<<<<< + * + * elif attr.name == b"alt": + */ + __pyx_v_attrs->has_hidden = 1; + + /* "gumbocy.pyx":316 + * attrs.has_hidden = 1 + * + * elif attr.name == b"hidden": # <<<<<<<<<<<<<< + * attrs.has_hidden = 1 + * + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":319 + * attrs.has_hidden = 1 + * + * elif 
attr.name == b"alt": # <<<<<<<<<<<<<< + * attrs.values[ATTR_ALT] = attr.value + * + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 319, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_alt, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 319, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":320 + * + * elif attr.name == b"alt": + * attrs.values[ATTR_ALT] = attr.value # <<<<<<<<<<<<<< + * + * elif attr.name == b"src": + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 320, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 320, __pyx_L1_error) + } + __pyx_t_7 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ALT); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 320, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_7, __pyx_t_6) < 0)) __PYX_ERR(0, 320, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":319 + * attrs.has_hidden = 1 + * + * elif attr.name == b"alt": # <<<<<<<<<<<<<< + * attrs.values[ATTR_ALT] = attr.value + * + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":322 + * attrs.values[ATTR_ALT] = attr.value + * + * elif attr.name == b"src": # <<<<<<<<<<<<<< + * attrs.values[ATTR_SRC] = attr.value + * + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 322, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_src, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 322, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":323 + * + * elif 
attr.name == b"src": + * attrs.values[ATTR_SRC] = attr.value # <<<<<<<<<<<<<< + * + * elif attr.name == b"name": + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 323, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 323, __pyx_L1_error) + } + __pyx_t_7 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_SRC); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 323, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_7, __pyx_t_6) < 0)) __PYX_ERR(0, 323, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":322 + * attrs.values[ATTR_ALT] = attr.value + * + * elif attr.name == b"src": # <<<<<<<<<<<<<< + * attrs.values[ATTR_SRC] = attr.value + * + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":325 + * attrs.values[ATTR_SRC] = attr.value + * + * elif attr.name == b"name": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_NAME] = pystr + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 325, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_name, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 325, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":326 + * + * elif attr.name == b"name": + * pystr = str(attr.value).lower() # <<<<<<<<<<<<<< + * attrs.values[ATTR_NAME] = pystr + * + */ + __pyx_t_7 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 326, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 326, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_7); + 
PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_7); + __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_1, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 326, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_lower); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 326, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + } + } + if (__pyx_t_7) { + __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_7); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 326, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } else { + __pyx_t_6 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 326, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF_SET(__pyx_v_pystr, __pyx_t_6); + __pyx_t_6 = 0; + + /* "gumbocy.pyx":327 + * elif attr.name == b"name": + * pystr = str(attr.value).lower() + * attrs.values[ATTR_NAME] = pystr # <<<<<<<<<<<<<< + * + * elif attr.name == b"property": + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 327, __pyx_L1_error) + } + __pyx_t_6 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_NAME); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 327, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_6, __pyx_v_pystr) < 0)) __PYX_ERR(0, 327, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":325 + * attrs.values[ATTR_SRC] = 
attr.value + * + * elif attr.name == b"name": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_NAME] = pystr + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":329 + * attrs.values[ATTR_NAME] = pystr + * + * elif attr.name == b"property": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_PROPERTY] = pystr + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 329, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_property, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 329, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":330 + * + * elif attr.name == b"property": + * pystr = str(attr.value).lower() # <<<<<<<<<<<<<< + * attrs.values[ATTR_PROPERTY] = pystr + * + */ + __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 330, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_7 = PyTuple_New(1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 330, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 330, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_lower); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 330, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_1)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_1); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (__pyx_t_1) { + __pyx_t_6 = 
__Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 330, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } else { + __pyx_t_6 = __Pyx_PyObject_CallNoArg(__pyx_t_7); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 330, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_XDECREF_SET(__pyx_v_pystr, __pyx_t_6); + __pyx_t_6 = 0; + + /* "gumbocy.pyx":331 + * elif attr.name == b"property": + * pystr = str(attr.value).lower() + * attrs.values[ATTR_PROPERTY] = pystr # <<<<<<<<<<<<<< + * + * elif attr.name == b"content": + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 331, __pyx_L1_error) + } + __pyx_t_6 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_PROPERTY); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 331, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_6, __pyx_v_pystr) < 0)) __PYX_ERR(0, 331, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":329 + * attrs.values[ATTR_NAME] = pystr + * + * elif attr.name == b"property": # <<<<<<<<<<<<<< + * pystr = str(attr.value).lower() + * attrs.values[ATTR_PROPERTY] = pystr + */ + goto __pyx_L6; + } + + /* "gumbocy.pyx":333 + * attrs.values[ATTR_PROPERTY] = pystr + * + * elif attr.name == b"content": # <<<<<<<<<<<<<< + * attrs.values[ATTR_CONTENT] = attr.value + * + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 333, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_4 = (__Pyx_PyBytes_Equals(__pyx_t_6, __pyx_n_b_content, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 333, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":334 + * + * elif attr.name == b"content": + * attrs.values[ATTR_CONTENT] = attr.value # <<<<<<<<<<<<<< + * + * return 
attrs + */ + __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 334, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 334, __pyx_L1_error) + } + __pyx_t_7 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_CONTENT); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 334, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (unlikely(PyDict_SetItem(__pyx_v_attrs->values, __pyx_t_7, __pyx_t_6) < 0)) __PYX_ERR(0, 334, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "gumbocy.pyx":333 + * attrs.values[ATTR_PROPERTY] = pystr + * + * elif attr.name == b"content": # <<<<<<<<<<<<<< + * attrs.values[ATTR_CONTENT] = attr.value + * + */ + } + __pyx_L6:; + + /* "gumbocy.pyx":284 + * attr = node.v.element.attributes.data[i] + * + * if re2_search(attr.name, deref(self.attributes_whitelist)): # <<<<<<<<<<<<<< + * + * if attr.name == b"class": + */ + } + } + + /* "gumbocy.pyx":336 + * attrs.values[ATTR_CONTENT] = attr.value + * + * return attrs # <<<<<<<<<<<<<< + * + * cdef void close_word_group(self): + */ + __Pyx_XDECREF(((PyObject *)__pyx_r)); + __Pyx_INCREF(((PyObject *)__pyx_v_attrs)); + __pyx_r = __pyx_v_attrs; + goto __pyx_L0; + + /* "gumbocy.pyx":267 + * return False + * + * cdef Attributes get_attributes(self, gumbocy.GumboNode* node): # <<<<<<<<<<<<<< + * """ Build a dict with all the whitelisted attributes """ + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_XDECREF(__pyx_t_9); + __Pyx_AddTraceback("gumbocy.HTMLParser.get_attributes", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_attrs); + 
__Pyx_XDECREF(__pyx_v_multiple_value); + __Pyx_XDECREF(__pyx_v_pystr); + __Pyx_XGIVEREF((PyObject *)__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "gumbocy.pyx":338 + * return attrs + * + * cdef void close_word_group(self): # <<<<<<<<<<<<<< + * """ Close the current word group """ + * + */ + +static void __pyx_f_7gumbocy_10HTMLParser_close_word_group(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { + __Pyx_RefNannyDeclarations + int __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + __Pyx_RefNannySetupContext("close_word_group", 0); + + /* "gumbocy.pyx":341 + * """ Close the current word group """ + * + * if self.current_word_group: # <<<<<<<<<<<<<< + * self.analysis["word_groups"].append(tuple(self.current_word_group)) + * self.current_word_group = None + */ + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_self->current_word_group); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 341, __pyx_L1_error) + if (__pyx_t_1) { + + /* "gumbocy.pyx":342 + * + * if self.current_word_group: + * self.analysis["word_groups"].append(tuple(self.current_word_group)) # <<<<<<<<<<<<<< + * self.current_word_group = None + * + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 342, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyDict_GetItem(__pyx_v_self->analysis, __pyx_n_s_word_groups); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 342, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = PySequence_Tuple(__pyx_v_self->current_word_group); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 342, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyObject_Append(__pyx_t_2, __pyx_t_3); if (unlikely(__pyx_t_4 == -1)) __PYX_ERR(0, 342, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":343 + * if self.current_word_group: + * 
self.analysis["word_groups"].append(tuple(self.current_word_group)) + * self.current_word_group = None # <<<<<<<<<<<<<< + * + * + */ + __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(Py_None); + __Pyx_GOTREF(__pyx_v_self->current_word_group); + __Pyx_DECREF(__pyx_v_self->current_word_group); + __pyx_v_self->current_word_group = Py_None; + + /* "gumbocy.pyx":341 + * """ Close the current word group """ + * + * if self.current_word_group: # <<<<<<<<<<<<<< + * self.analysis["word_groups"].append(tuple(self.current_word_group)) + * self.current_word_group = None + */ + } + + /* "gumbocy.pyx":338 + * return attrs + * + * cdef void close_word_group(self): # <<<<<<<<<<<<<< + * """ Close the current word group """ + * + */ + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.close_word_group", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); +} + +/* "gumbocy.pyx":346 + * + * + * cdef void add_text(self, text): # <<<<<<<<<<<<<< + * + * if not self.current_word_group: + */ + +static void __pyx_f_7gumbocy_10HTMLParser_add_text(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_text) { + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + Py_ssize_t __pyx_t_6; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("add_text", 0); + + /* "gumbocy.pyx":348 + * cdef void add_text(self, text): + * + * if not self.current_word_group: # <<<<<<<<<<<<<< + * self.current_word_group = [text.strip(), self.current_stack[-1]] + * else: + */ + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_self->current_word_group); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 348, __pyx_L1_error) + __pyx_t_2 = ((!__pyx_t_1) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":349 + * + * if not self.current_word_group: 
+ * self.current_word_group = [text.strip(), self.current_stack[-1]] # <<<<<<<<<<<<<< + * else: + * self.current_word_group[0] += " " + text.strip() + */ + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_text, __pyx_n_s_strip); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 349, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + if (__pyx_t_5) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 349, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else { + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 349, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (unlikely(__pyx_v_self->current_stack == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 349, __pyx_L1_error) + } + __pyx_t_4 = __Pyx_GetItemInt_List(__pyx_v_self->current_stack, -1L, long, 1, __Pyx_PyInt_From_long, 1, 1, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 349, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = PyList_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 349, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_3); + PyList_SET_ITEM(__pyx_t_5, 0, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); + PyList_SET_ITEM(__pyx_t_5, 1, __pyx_t_4); + __pyx_t_3 = 0; + __pyx_t_4 = 0; + __Pyx_GIVEREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_v_self->current_word_group); + __Pyx_DECREF(__pyx_v_self->current_word_group); + __pyx_v_self->current_word_group = __pyx_t_5; + __pyx_t_5 = 0; + + /* "gumbocy.pyx":348 + * cdef void add_text(self, text): + * + * if not self.current_word_group: # <<<<<<<<<<<<<< + * 
self.current_word_group = [text.strip(), self.current_stack[-1]] + * else: + */ + goto __pyx_L3; + } + + /* "gumbocy.pyx":351 + * self.current_word_group = [text.strip(), self.current_stack[-1]] + * else: + * self.current_word_group[0] += " " + text.strip() # <<<<<<<<<<<<<< + * + * cdef void add_hyperlink_text(self, text): + */ + /*else*/ { + __Pyx_INCREF(__pyx_v_self->current_word_group); + __pyx_t_5 = __pyx_v_self->current_word_group; + __pyx_t_6 = 0; + __pyx_t_4 = __Pyx_GetItemInt(__pyx_t_5, __pyx_t_6, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 351, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_v_text, __pyx_n_s_strip); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 351, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_8 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_8 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_8)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_8); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (__pyx_t_8) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_8); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 351, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + } else { + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_7); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 351, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = PyNumber_Add(__pyx_kp_s__4, __pyx_t_3); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 351, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_InPlaceAdd(__pyx_t_4, __pyx_t_7); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 351, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + if (unlikely(__Pyx_SetItemInt(__pyx_t_5, __pyx_t_6, __pyx_t_3, Py_ssize_t, 1, 
PyInt_FromSsize_t, 0, 1, 1) < 0)) __PYX_ERR(0, 351, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + __pyx_L3:; + + /* "gumbocy.pyx":346 + * + * + * cdef void add_text(self, text): # <<<<<<<<<<<<<< + * + * if not self.current_word_group: + */ + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.add_text", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); +} + +/* "gumbocy.pyx":353 + * self.current_word_group[0] += " " + text.strip() + * + * cdef void add_hyperlink_text(self, text): # <<<<<<<<<<<<<< + * if self.current_hyperlink: + * self.current_hyperlink[1] += text + */ + +static void __pyx_f_7gumbocy_10HTMLParser_add_hyperlink_text(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, PyObject *__pyx_v_text) { + __Pyx_RefNannyDeclarations + int __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + Py_ssize_t __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + __Pyx_RefNannySetupContext("add_hyperlink_text", 0); + + /* "gumbocy.pyx":354 + * + * cdef void add_hyperlink_text(self, text): + * if self.current_hyperlink: # <<<<<<<<<<<<<< + * self.current_hyperlink[1] += text + * + */ + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_self->current_hyperlink); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 354, __pyx_L1_error) + if (__pyx_t_1) { + + /* "gumbocy.pyx":355 + * cdef void add_hyperlink_text(self, text): + * if self.current_hyperlink: + * self.current_hyperlink[1] += text # <<<<<<<<<<<<<< + * + * cdef void open_hyperlink(self, Attributes attrs): + */ + __Pyx_INCREF(__pyx_v_self->current_hyperlink); + __pyx_t_2 = __pyx_v_self->current_hyperlink; + __pyx_t_3 = 1; + __pyx_t_4 = __Pyx_GetItemInt(__pyx_t_2, __pyx_t_3, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if 
(unlikely(!__pyx_t_4)) __PYX_ERR(0, 355, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = PyNumber_InPlaceAdd(__pyx_t_4, __pyx_v_text); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 355, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (unlikely(__Pyx_SetItemInt(__pyx_t_2, __pyx_t_3, __pyx_t_5, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1) < 0)) __PYX_ERR(0, 355, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "gumbocy.pyx":354 + * + * cdef void add_hyperlink_text(self, text): + * if self.current_hyperlink: # <<<<<<<<<<<<<< + * self.current_hyperlink[1] += text + * + */ + } + + /* "gumbocy.pyx":353 + * self.current_word_group[0] += " " + text.strip() + * + * cdef void add_hyperlink_text(self, text): # <<<<<<<<<<<<<< + * if self.current_hyperlink: + * self.current_hyperlink[1] += text + */ + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.add_hyperlink_text", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); +} + +/* "gumbocy.pyx":357 + * self.current_hyperlink[1] += text + * + * cdef void open_hyperlink(self, Attributes attrs): # <<<<<<<<<<<<<< + * + * if not attrs.values.get(ATTR_HREF): + */ + +static void __pyx_f_7gumbocy_10HTMLParser_open_hyperlink(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs) { + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + int __pyx_t_3; + int __pyx_t_4; + Py_ssize_t __pyx_t_5; + char const *__pyx_t_6; + __Pyx_RefNannySetupContext("open_hyperlink", 0); + + /* "gumbocy.pyx":359 + * cdef void open_hyperlink(self, Attributes attrs): + * + * if not attrs.values.get(ATTR_HREF): # <<<<<<<<<<<<<< + * return + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) 
{ + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 359, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 359, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_1, Py_None); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 359, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_3 < 0)) __PYX_ERR(0, 359, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_4 = ((!__pyx_t_3) != 0); + if (__pyx_t_4) { + + /* "gumbocy.pyx":360 + * + * if not attrs.values.get(ATTR_HREF): + * return # <<<<<<<<<<<<<< + * + * if len(attrs.values[ATTR_HREF]) == 0: + */ + goto __pyx_L0; + + /* "gumbocy.pyx":359 + * cdef void open_hyperlink(self, Attributes attrs): + * + * if not attrs.values.get(ATTR_HREF): # <<<<<<<<<<<<<< + * return + * + */ + } + + /* "gumbocy.pyx":362 + * return + * + * if len(attrs.values[ATTR_HREF]) == 0: # <<<<<<<<<<<<<< + * return + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 362, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 362, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 362, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_5 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 362, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_4 = ((__pyx_t_5 == 0) != 0); + if (__pyx_t_4) { + + /* "gumbocy.pyx":363 + * + * if len(attrs.values[ATTR_HREF]) 
== 0: + * return # <<<<<<<<<<<<<< + * + * if re2_search(attrs.values[ATTR_HREF], deref(_RE2_IGNORED_HREF)): + */ + goto __pyx_L0; + + /* "gumbocy.pyx":362 + * return + * + * if len(attrs.values[ATTR_HREF]) == 0: # <<<<<<<<<<<<<< + * return + * + */ + } + + /* "gumbocy.pyx":365 + * return + * + * if re2_search(attrs.values[ATTR_HREF], deref(_RE2_IGNORED_HREF)): # <<<<<<<<<<<<<< + * return + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 365, __pyx_L1_error) + } + __pyx_t_1 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 365, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 365, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_6 = __Pyx_PyObject_AsString(__pyx_t_2); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 365, __pyx_L1_error) + __pyx_t_4 = (__pyx_f_7gumbocy_re2_search(__pyx_t_6, (*__pyx_v_7gumbocy__RE2_IGNORED_HREF)) != 0); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (__pyx_t_4) { + + /* "gumbocy.pyx":366 + * + * if re2_search(attrs.values[ATTR_HREF], deref(_RE2_IGNORED_HREF)): + * return # <<<<<<<<<<<<<< + * + * self.close_hyperlink() + */ + goto __pyx_L0; + + /* "gumbocy.pyx":365 + * return + * + * if re2_search(attrs.values[ATTR_HREF], deref(_RE2_IGNORED_HREF)): # <<<<<<<<<<<<<< + * return + * + */ + } + + /* "gumbocy.pyx":368 + * return + * + * self.close_hyperlink() # <<<<<<<<<<<<<< + * self.current_hyperlink = [attrs.values[ATTR_HREF], ""] + * + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->close_hyperlink(__pyx_v_self); + + /* "gumbocy.pyx":369 + * + * self.close_hyperlink() + * self.current_hyperlink = [attrs.values[ATTR_HREF], ""] # <<<<<<<<<<<<<< + * + * cdef void 
close_hyperlink(self): + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 369, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 369, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 369, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = PyList_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 369, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GIVEREF(__pyx_t_1); + PyList_SET_ITEM(__pyx_t_2, 0, __pyx_t_1); + __Pyx_INCREF(__pyx_kp_s__5); + __Pyx_GIVEREF(__pyx_kp_s__5); + PyList_SET_ITEM(__pyx_t_2, 1, __pyx_kp_s__5); + __pyx_t_1 = 0; + __Pyx_GIVEREF(__pyx_t_2); + __Pyx_GOTREF(__pyx_v_self->current_hyperlink); + __Pyx_DECREF(__pyx_v_self->current_hyperlink); + __pyx_v_self->current_hyperlink = __pyx_t_2; + __pyx_t_2 = 0; + + /* "gumbocy.pyx":357 + * self.current_hyperlink[1] += text + * + * cdef void open_hyperlink(self, Attributes attrs): # <<<<<<<<<<<<<< + * + * if not attrs.values.get(ATTR_HREF): + */ + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.open_hyperlink", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); +} + +/* "gumbocy.pyx":371 + * self.current_hyperlink = [attrs.values[ATTR_HREF], ""] + * + * cdef void close_hyperlink(self): # <<<<<<<<<<<<<< + * if self.current_hyperlink: + * href = self.current_hyperlink[0] + */ + +static void __pyx_f_7gumbocy_10HTMLParser_close_hyperlink(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { + PyObject *__pyx_v_href = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + char 
const *__pyx_t_3; + PyObject *__pyx_t_4 = NULL; + int __pyx_t_5; + int __pyx_t_6; + __Pyx_RefNannySetupContext("close_hyperlink", 0); + + /* "gumbocy.pyx":372 + * + * cdef void close_hyperlink(self): + * if self.current_hyperlink: # <<<<<<<<<<<<<< + * href = self.current_hyperlink[0] + * + */ + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_self->current_hyperlink); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 372, __pyx_L1_error) + if (__pyx_t_1) { + + /* "gumbocy.pyx":373 + * cdef void close_hyperlink(self): + * if self.current_hyperlink: + * href = self.current_hyperlink[0] # <<<<<<<<<<<<<< + * + * # TODO: absolute links to same domain + */ + __pyx_t_2 = __Pyx_GetItemInt(__pyx_v_self->current_hyperlink, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 373, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_v_href = __pyx_t_2; + __pyx_t_2 = 0; + + /* "gumbocy.pyx":376 + * + * # TODO: absolute links to same domain + * if re2_search(href, deref(_RE2_EXTERNAL_HREF)): # <<<<<<<<<<<<<< + * if self.analyze_external_hyperlinks: + * if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): + */ + __pyx_t_3 = __Pyx_PyObject_AsString(__pyx_v_href); if (unlikely((!__pyx_t_3) && PyErr_Occurred())) __PYX_ERR(0, 376, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_3, (*__pyx_v_7gumbocy__RE2_EXTERNAL_HREF)) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":377 + * # TODO: absolute links to same domain + * if re2_search(href, deref(_RE2_EXTERNAL_HREF)): + * if self.analyze_external_hyperlinks: # <<<<<<<<<<<<<< + * if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + */ + __pyx_t_1 = (__pyx_v_self->analyze_external_hyperlinks != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":378 + * if re2_search(href, deref(_RE2_EXTERNAL_HREF)): + * if self.analyze_external_hyperlinks: + * if 
href.startswith("http://") or href.startswith("https://") or href.startswith("//"): # <<<<<<<<<<<<<< + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + * else: + */ + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_href, __pyx_n_s_startswith); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_5 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_5 < 0)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (!__pyx_t_5) { + } else { + __pyx_t_1 = __pyx_t_5; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_href, __pyx_n_s_startswith); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_5 < 0)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!__pyx_t_5) { + } else { + __pyx_t_1 = __pyx_t_5; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_href, __pyx_n_s_startswith); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_5 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_5 < 0)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_1 = __pyx_t_5; + __pyx_L7_bool_binop_done:; + if (__pyx_t_1) { + + 
/* "gumbocy.pyx":379 + * if self.analyze_external_hyperlinks: + * if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) # <<<<<<<<<<<<<< + * else: + * if self.analyze_internal_hyperlinks: + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 379, __pyx_L1_error) + } + __pyx_t_4 = __Pyx_PyDict_GetItem(__pyx_v_self->analysis, __pyx_n_s_external_hyperlinks); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 379, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_2 = PySequence_Tuple(__pyx_v_self->current_hyperlink); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 379, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_6 = __Pyx_PyObject_Append(__pyx_t_4, __pyx_t_2); if (unlikely(__pyx_t_6 == -1)) __PYX_ERR(0, 379, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "gumbocy.pyx":378 + * if re2_search(href, deref(_RE2_EXTERNAL_HREF)): + * if self.analyze_external_hyperlinks: + * if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): # <<<<<<<<<<<<<< + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + * else: + */ + } + + /* "gumbocy.pyx":377 + * # TODO: absolute links to same domain + * if re2_search(href, deref(_RE2_EXTERNAL_HREF)): + * if self.analyze_external_hyperlinks: # <<<<<<<<<<<<<< + * if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + */ + } + + /* "gumbocy.pyx":376 + * + * # TODO: absolute links to same domain + * if re2_search(href, deref(_RE2_EXTERNAL_HREF)): # <<<<<<<<<<<<<< + * if self.analyze_external_hyperlinks: + * if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): + */ + goto __pyx_L4; + } + + /* 
"gumbocy.pyx":381 + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + * else: + * if self.analyze_internal_hyperlinks: # <<<<<<<<<<<<<< + * self.analysis["internal_hyperlinks"].append(tuple(self.current_hyperlink)) + * + */ + /*else*/ { + __pyx_t_1 = (__pyx_v_self->analyze_internal_hyperlinks != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":382 + * else: + * if self.analyze_internal_hyperlinks: + * self.analysis["internal_hyperlinks"].append(tuple(self.current_hyperlink)) # <<<<<<<<<<<<<< + * + * self.current_hyperlink = None + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 382, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyDict_GetItem(__pyx_v_self->analysis, __pyx_n_s_internal_hyperlinks); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 382, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = PySequence_Tuple(__pyx_v_self->current_hyperlink); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 382, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_6 = __Pyx_PyObject_Append(__pyx_t_2, __pyx_t_4); if (unlikely(__pyx_t_6 == -1)) __PYX_ERR(0, 382, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + + /* "gumbocy.pyx":381 + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + * else: + * if self.analyze_internal_hyperlinks: # <<<<<<<<<<<<<< + * self.analysis["internal_hyperlinks"].append(tuple(self.current_hyperlink)) + * + */ + } + } + __pyx_L4:; + + /* "gumbocy.pyx":384 + * self.analysis["internal_hyperlinks"].append(tuple(self.current_hyperlink)) + * + * self.current_hyperlink = None # <<<<<<<<<<<<<< + * + * cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node, bint is_head, bint is_hidden, bint is_boilerplate, bint is_boilerplate_bypassed, bint is_hyperlink): + */ + __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(Py_None); + __Pyx_GOTREF(__pyx_v_self->current_hyperlink); + 
__Pyx_DECREF(__pyx_v_self->current_hyperlink); + __pyx_v_self->current_hyperlink = Py_None; + + /* "gumbocy.pyx":372 + * + * cdef void close_hyperlink(self): + * if self.current_hyperlink: # <<<<<<<<<<<<<< + * href = self.current_hyperlink[0] + * + */ + } + + /* "gumbocy.pyx":371 + * self.current_hyperlink = [attrs.values[ATTR_HREF], ""] + * + * cdef void close_hyperlink(self): # <<<<<<<<<<<<<< + * if self.current_hyperlink: + * href = self.current_hyperlink[0] + */ + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.close_hyperlink", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_href); + __Pyx_RefNannyFinishContext(); +} + +/* "gumbocy.pyx":386 + * self.current_hyperlink = None + * + * cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node, bint is_head, bint is_hidden, bint is_boilerplate, bint is_boilerplate_bypassed, bint is_hyperlink): # <<<<<<<<<<<<<< + * """ Traverses the node tree. 
Return 1 to stop at this level """ + * + */ + +static int __pyx_f_7gumbocy_10HTMLParser__traverse_node(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, int __pyx_v_level, GumboNode *__pyx_v_node, int __pyx_v_is_head, int __pyx_v_is_hidden, int __pyx_v_is_boilerplate, int __pyx_v_is_boilerplate_bypassed, int __pyx_v_is_hyperlink) { + GumboStringPiece __pyx_v_gsp; + char const *__pyx_v_tag_name; + int __pyx_v_tag_n; + PyObject *__pyx_v_py_tag_name = NULL; + struct __pyx_obj_7gumbocy_Attributes *__pyx_v_attrs = NULL; + PyObject *__pyx_v_v = NULL; + GumboNode *__pyx_v_first_child; + unsigned int __pyx_v_i; + GumboNode *__pyx_v_child; + int __pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + GumboStringPiece __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + char const *__pyx_t_7; + Py_ssize_t __pyx_t_8; + char const *__pyx_t_9; + int __pyx_t_10; + PyObject *__pyx_t_11 = NULL; + int __pyx_t_12; + unsigned int __pyx_t_13; + unsigned int __pyx_t_14; + __Pyx_RefNannySetupContext("_traverse_node", 0); + + /* "gumbocy.pyx":393 + * cdef int tag_n + * + * if level > self.nesting_limit: # <<<<<<<<<<<<<< + * return 0 + * + */ + __pyx_t_1 = ((__pyx_v_level > __pyx_v_self->nesting_limit) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":394 + * + * if level > self.nesting_limit: + * return 0 # <<<<<<<<<<<<<< + * + * if node.type == gumbocy.GUMBO_NODE_TEXT: + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":393 + * cdef int tag_n + * + * if level > self.nesting_limit: # <<<<<<<<<<<<<< + * return 0 + * + */ + } + + /* "gumbocy.pyx":396 + * return 0 + * + * if node.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< + * + * if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink: + */ + switch (__pyx_v_node->type) { + case GUMBO_NODE_TEXT: + + /* "gumbocy.pyx":398 + * if node.type == gumbocy.GUMBO_NODE_TEXT: + * + * if (self.analyze_internal_hyperlinks or 
self.analyze_external_hyperlinks) and is_hyperlink: # <<<<<<<<<<<<<< + * self.add_hyperlink_text(node.v.text.text) + * + */ + __pyx_t_2 = (__pyx_v_self->analyze_internal_hyperlinks != 0); + if (!__pyx_t_2) { + } else { + goto __pyx_L6_next_and; + } + __pyx_t_2 = (__pyx_v_self->analyze_external_hyperlinks != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L5_bool_binop_done; + } + __pyx_L6_next_and:; + __pyx_t_2 = (__pyx_v_is_hyperlink != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L5_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":399 + * + * if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink: + * self.add_hyperlink_text(node.v.text.text) # <<<<<<<<<<<<<< + * + * if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed): + */ + __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_node->v.text.text); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 399, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->add_hyperlink_text(__pyx_v_self, __pyx_t_3); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":398 + * if node.type == gumbocy.GUMBO_NODE_TEXT: + * + * if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink: # <<<<<<<<<<<<<< + * self.add_hyperlink_text(node.v.text.text) + * + */ + } + + /* "gumbocy.pyx":401 + * self.add_hyperlink_text(node.v.text.text) + * + * if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed): # <<<<<<<<<<<<<< + * self.add_text(node.v.text.text) + * + */ + __pyx_t_2 = (__pyx_v_self->analyze_word_groups != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L9_bool_binop_done; + } + __pyx_t_2 = ((!(__pyx_v_is_head != 0)) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L9_bool_binop_done; + } + __pyx_t_2 = 
((!(__pyx_v_is_hidden != 0)) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L9_bool_binop_done; + } + __pyx_t_2 = ((!(__pyx_v_is_boilerplate != 0)) != 0); + if (!__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L9_bool_binop_done; + } + __pyx_t_2 = (__pyx_v_is_boilerplate_bypassed != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L9_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":402 + * + * if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed): + * self.add_text(node.v.text.text) # <<<<<<<<<<<<<< + * + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: + */ + __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_node->v.text.text); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 402, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->add_text(__pyx_v_self, __pyx_t_3); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":401 + * self.add_hyperlink_text(node.v.text.text) + * + * if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed): # <<<<<<<<<<<<<< + * self.add_text(node.v.text.text) + * + */ + } + + /* "gumbocy.pyx":396 + * return 0 + * + * if node.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< + * + * if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink: + */ + break; + + /* "gumbocy.pyx":404 + * self.add_text(node.v.text.text) + * + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: # <<<<<<<<<<<<<< + * + * tag_n = node.v.element.tag + */ + case GUMBO_NODE_ELEMENT: + + /* "gumbocy.pyx":406 + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: + * + * tag_n = node.v.element.tag # <<<<<<<<<<<<<< + * + * if self.head_only and self.tags_ignore_head_only.count(tag_n): + */ + __pyx_v_tag_n = ((int)__pyx_v_node->v.element.tag); + + /* "gumbocy.pyx":408 + * tag_n = node.v.element.tag + * + * if self.head_only and 
self.tags_ignore_head_only.count(tag_n): # <<<<<<<<<<<<<< + * return 1 + * + */ + __pyx_t_2 = (__pyx_v_self->head_only != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L15_bool_binop_done; + } + __pyx_t_2 = (__pyx_v_self->tags_ignore_head_only.count(__pyx_v_tag_n) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L15_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":409 + * + * if self.head_only and self.tags_ignore_head_only.count(tag_n): + * return 1 # <<<<<<<<<<<<<< + * + * if self.tags_ignore.count(tag_n): + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":408 + * tag_n = node.v.element.tag + * + * if self.head_only and self.tags_ignore_head_only.count(tag_n): # <<<<<<<<<<<<<< + * return 1 + * + */ + } + + /* "gumbocy.pyx":411 + * return 1 + * + * if self.tags_ignore.count(tag_n): # <<<<<<<<<<<<<< + * return 0 + * + */ + __pyx_t_1 = (__pyx_v_self->tags_ignore.count(__pyx_v_tag_n) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":412 + * + * if self.tags_ignore.count(tag_n): + * return 0 # <<<<<<<<<<<<<< + * + * tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag) + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":411 + * return 1 + * + * if self.tags_ignore.count(tag_n): # <<<<<<<<<<<<<< + * return 0 + * + */ + } + + /* "gumbocy.pyx":414 + * return 0 + * + * tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag) # <<<<<<<<<<<<<< + * + * # When we find an unknown tag, find its tag_name in the buffer + */ + __pyx_v_tag_name = gumbo_normalized_tagname(__pyx_v_node->v.element.tag); + + /* "gumbocy.pyx":417 + * + * # When we find an unknown tag, find its tag_name in the buffer + * if tag_name == b"": # <<<<<<<<<<<<<< + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) + */ + __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 417, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = (__Pyx_PyBytes_Equals(__pyx_t_3, __pyx_kp_b__5, Py_EQ)); if 
(unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 417, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_1) { + + /* "gumbocy.pyx":418 + * # When we find an unknown tag, find its tag_name in the buffer + * if tag_name == b"": + * gsp = node.v.element.original_tag # <<<<<<<<<<<<<< + * gumbo_tag_from_original_text(&gsp) + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! + */ + __pyx_t_4 = __pyx_v_node->v.element.original_tag; + __pyx_v_gsp = __pyx_t_4; + + /* "gumbocy.pyx":419 + * if tag_name == b"": + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) # <<<<<<<<<<<<<< + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! + * tag_name = py_tag_name + */ + gumbo_tag_from_original_text((&__pyx_v_gsp)); + + /* "gumbocy.pyx":420 + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! # <<<<<<<<<<<<<< + * tag_name = py_tag_name + * + */ + __pyx_t_5 = __Pyx_PyBytes_FromString(__pyx_v_gsp.data); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 420, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 420, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_6, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 420, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyObject_GetSlice(__pyx_t_5, 0, __pyx_v_gsp.length, NULL, NULL, NULL, 1, 1, 1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 420, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_6, __pyx_n_s_lower); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 420, __pyx_L1_error) + 
__Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_5))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_5, function); + } + } + if (__pyx_t_6) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 420, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else { + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 420, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_py_tag_name = __pyx_t_3; + __pyx_t_3 = 0; + + /* "gumbocy.pyx":421 + * gumbo_tag_from_original_text(&gsp) + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! + * tag_name = py_tag_name # <<<<<<<<<<<<<< + * + * # if self.has_attributes_whitelist: + */ + __pyx_t_7 = __Pyx_PyObject_AsString(__pyx_v_py_tag_name); if (unlikely((!__pyx_t_7) && PyErr_Occurred())) __PYX_ERR(0, 421, __pyx_L1_error) + __pyx_v_tag_name = ((char const *)__pyx_t_7); + + /* "gumbocy.pyx":417 + * + * # When we find an unknown tag, find its tag_name in the buffer + * if tag_name == b"": # <<<<<<<<<<<<<< + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) + */ + } + + /* "gumbocy.pyx":425 + * # if self.has_attributes_whitelist: + * + * attrs = self.get_attributes(node) # <<<<<<<<<<<<<< + * + * if self.has_classes_ignore and attrs.size_classes > 0: + */ + __pyx_t_3 = ((PyObject *)((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->get_attributes(__pyx_v_self, __pyx_v_node)); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 425, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_v_attrs = ((struct __pyx_obj_7gumbocy_Attributes *)__pyx_t_3); + __pyx_t_3 = 0; + + /* 
"gumbocy.pyx":427 + * attrs = self.get_attributes(node) + * + * if self.has_classes_ignore and attrs.size_classes > 0: # <<<<<<<<<<<<<< + * for v in attrs.classes: + * if re2_search(v, deref(self.classes_ignore)): + */ + __pyx_t_2 = (__pyx_v_self->has_classes_ignore != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L20_bool_binop_done; + } + __pyx_t_2 = ((__pyx_v_attrs->size_classes > 0) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L20_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":428 + * + * if self.has_classes_ignore and attrs.size_classes > 0: + * for v in attrs.classes: # <<<<<<<<<<<<<< + * if re2_search(v, deref(self.classes_ignore)): + * return 0 + */ + if (unlikely(__pyx_v_attrs->classes == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(0, 428, __pyx_L1_error) + } + __pyx_t_3 = __pyx_v_attrs->classes; __Pyx_INCREF(__pyx_t_3); __pyx_t_8 = 0; + for (;;) { + if (__pyx_t_8 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 428, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 428, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + __Pyx_XDECREF_SET(__pyx_v_v, __pyx_t_5); + __pyx_t_5 = 0; + + /* "gumbocy.pyx":429 + * if self.has_classes_ignore and attrs.size_classes > 0: + * for v in attrs.classes: + * if re2_search(v, deref(self.classes_ignore)): # <<<<<<<<<<<<<< + * return 0 + * + */ + __pyx_t_9 = __Pyx_PyObject_AsString(__pyx_v_v); if (unlikely((!__pyx_t_9) && PyErr_Occurred())) __PYX_ERR(0, 429, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_9, (*__pyx_v_self->classes_ignore)) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":430 + * for v in attrs.classes: + * if re2_search(v, deref(self.classes_ignore)): + * return 0 # <<<<<<<<<<<<<< + 
* + * if self.has_ids_ignore and attrs.values.get(ATTR_ID): + */ + __pyx_r = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":429 + * if self.has_classes_ignore and attrs.size_classes > 0: + * for v in attrs.classes: + * if re2_search(v, deref(self.classes_ignore)): # <<<<<<<<<<<<<< + * return 0 + * + */ + } + + /* "gumbocy.pyx":428 + * + * if self.has_classes_ignore and attrs.size_classes > 0: + * for v in attrs.classes: # <<<<<<<<<<<<<< + * if re2_search(v, deref(self.classes_ignore)): + * return 0 + */ + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":427 + * attrs = self.get_attributes(node) + * + * if self.has_classes_ignore and attrs.size_classes > 0: # <<<<<<<<<<<<<< + * for v in attrs.classes: + * if re2_search(v, deref(self.classes_ignore)): + */ + } + + /* "gumbocy.pyx":432 + * return 0 + * + * if self.has_ids_ignore and attrs.values.get(ATTR_ID): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)): + * return 0 + */ + __pyx_t_2 = (__pyx_v_self->has_ids_ignore != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L26_bool_binop_done; + } + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 432, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ID); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 432, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 432, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 432, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_1 = __pyx_t_2; + __pyx_L26_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":433 + * + * 
if self.has_ids_ignore and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)): # <<<<<<<<<<<<<< + * return 0 + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 433, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ID); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 433, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 433, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_9 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_9) && PyErr_Occurred())) __PYX_ERR(0, 433, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_9, (*__pyx_v_self->ids_ignore)) != 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_1) { + + /* "gumbocy.pyx":434 + * if self.has_ids_ignore and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)): + * return 0 # <<<<<<<<<<<<<< + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":433 + * + * if self.has_ids_ignore and attrs.values.get(ATTR_ID): + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)): # <<<<<<<<<<<<<< + * return 0 + * + */ + } + + /* "gumbocy.pyx":432 + * return 0 + * + * if self.has_ids_ignore and attrs.values.get(ATTR_ID): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)): + * return 0 + */ + } + + /* "gumbocy.pyx":436 + * return 0 + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: # <<<<<<<<<<<<<< + * if not self.analysis.get("title"): + * if node.v.element.children.length > 0: + */ + __pyx_t_1 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_TITLE) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":437 + * + * if 
node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: + * if not self.analysis.get("title"): # <<<<<<<<<<<<<< + * if node.v.element.children.length > 0: + * first_child = node.v.element.children.data[0] + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 437, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_self->analysis, __pyx_n_s_title, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 437, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 437, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = ((!__pyx_t_1) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":438 + * if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: + * if not self.analysis.get("title"): + * if node.v.element.children.length > 0: # <<<<<<<<<<<<<< + * first_child = node.v.element.children.data[0] + * if first_child.type == gumbocy.GUMBO_NODE_TEXT: + */ + __pyx_t_2 = ((__pyx_v_node->v.element.children.length > 0) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":439 + * if not self.analysis.get("title"): + * if node.v.element.children.length > 0: + * first_child = node.v.element.children.data[0] # <<<<<<<<<<<<<< + * if first_child.type == gumbocy.GUMBO_NODE_TEXT: + * self.analysis["title"] = first_child.v.text.text + */ + __pyx_v_first_child = ((GumboNode *)(__pyx_v_node->v.element.children.data[0])); + + /* "gumbocy.pyx":440 + * if node.v.element.children.length > 0: + * first_child = node.v.element.children.data[0] + * if first_child.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< + * self.analysis["title"] = first_child.v.text.text + * return 0 + */ + __pyx_t_2 = ((__pyx_v_first_child->type == GUMBO_NODE_TEXT) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":441 + * first_child = node.v.element.children.data[0] + * if first_child.type == gumbocy.GUMBO_NODE_TEXT: + * 
self.analysis["title"] = first_child.v.text.text # <<<<<<<<<<<<<< + * return 0 + * + */ + __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_first_child->v.text.text); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 441, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 441, __pyx_L1_error) + } + if (unlikely(PyDict_SetItem(__pyx_v_self->analysis, __pyx_n_s_title, __pyx_t_3) < 0)) __PYX_ERR(0, 441, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":440 + * if node.v.element.children.length > 0: + * first_child = node.v.element.children.data[0] + * if first_child.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< + * self.analysis["title"] = first_child.v.text.text + * return 0 + */ + } + + /* "gumbocy.pyx":438 + * if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: + * if not self.analysis.get("title"): + * if node.v.element.children.length > 0: # <<<<<<<<<<<<<< + * first_child = node.v.element.children.data[0] + * if first_child.type == gumbocy.GUMBO_NODE_TEXT: + */ + } + + /* "gumbocy.pyx":437 + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: + * if not self.analysis.get("title"): # <<<<<<<<<<<<<< + * if node.v.element.children.length > 0: + * first_child = node.v.element.children.data[0] + */ + } + + /* "gumbocy.pyx":442 + * if first_child.type == gumbocy.GUMBO_NODE_TEXT: + * self.analysis["title"] = first_child.v.text.text + * return 0 # <<<<<<<<<<<<<< + * + * self.current_stack.append(tag_name) + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":436 + * return 0 + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: # <<<<<<<<<<<<<< + * if not self.analysis.get("title"): + * if node.v.element.children.length > 0: + */ + } + + /* "gumbocy.pyx":444 + * return 0 + * + * self.current_stack.append(tag_name) # <<<<<<<<<<<<<< + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + */ + if 
(unlikely(__pyx_v_self->current_stack == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); + __PYX_ERR(0, 444, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 444, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_10 = __Pyx_PyList_Append(__pyx_v_self->current_stack, __pyx_t_3); if (unlikely(__pyx_t_10 == -1)) __PYX_ERR(0, 444, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":446 + * self.current_stack.append(tag_name) + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: # <<<<<<<<<<<<<< + * is_head = 1 + * + */ + __pyx_t_2 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_HEAD) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":447 + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + * is_head = 1 # <<<<<<<<<<<<<< + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_A: + */ + __pyx_v_is_head = 1; + + /* "gumbocy.pyx":446 + * self.current_stack.append(tag_name) + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: # <<<<<<<<<<<<<< + * is_head = 1 + * + */ + goto __pyx_L33; + } + + /* "gumbocy.pyx":449 + * is_head = 1 + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_A: # <<<<<<<<<<<<<< + * self.open_hyperlink(attrs) + * is_hyperlink = 1 + */ + __pyx_t_2 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_A) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":450 + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_A: + * self.open_hyperlink(attrs) # <<<<<<<<<<<<<< + * is_hyperlink = 1 + * + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->open_hyperlink(__pyx_v_self, __pyx_v_attrs); + + /* "gumbocy.pyx":451 + * elif node.v.element.tag == gumbocy.GUMBO_TAG_A: + * self.open_hyperlink(attrs) + * is_hyperlink = 1 # <<<<<<<<<<<<<< + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG: + */ + __pyx_v_is_hyperlink = 1; + + /* "gumbocy.pyx":449 + * is_head = 1 + * + * elif node.v.element.tag 
== gumbocy.GUMBO_TAG_A: # <<<<<<<<<<<<<< + * self.open_hyperlink(attrs) + * is_hyperlink = 1 + */ + goto __pyx_L33; + } + + /* "gumbocy.pyx":453 + * is_hyperlink = 1 + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG: # <<<<<<<<<<<<<< + * self.close_word_group() + * if attrs.values.get(ATTR_ALT): + */ + __pyx_t_2 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_IMG) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":454 + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG: + * self.close_word_group() # <<<<<<<<<<<<<< + * if attrs.values.get(ATTR_ALT): + * self.add_text(attrs.values[ATTR_ALT]) + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->close_word_group(__pyx_v_self); + + /* "gumbocy.pyx":455 + * elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG: + * self.close_word_group() + * if attrs.values.get(ATTR_ALT): # <<<<<<<<<<<<<< + * self.add_text(attrs.values[ATTR_ALT]) + * self.close_word_group() + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 455, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ALT); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 455, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 455, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 455, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":456 + * self.close_word_group() + * if attrs.values.get(ATTR_ALT): + * self.add_text(attrs.values[ATTR_ALT]) # <<<<<<<<<<<<<< + * self.close_word_group() + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not 
subscriptable"); + __PYX_ERR(0, 456, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_ALT); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 456, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 456, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->add_text(__pyx_v_self, __pyx_t_3); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":457 + * if attrs.values.get(ATTR_ALT): + * self.add_text(attrs.values[ATTR_ALT]) + * self.close_word_group() # <<<<<<<<<<<<<< + * + * # Text extraction from image filenames disabled for now + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->close_word_group(__pyx_v_self); + + /* "gumbocy.pyx":455 + * elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG: + * self.close_word_group() + * if attrs.values.get(ATTR_ALT): # <<<<<<<<<<<<<< + * self.add_text(attrs.values[ATTR_ALT]) + * self.close_word_group() + */ + } + + /* "gumbocy.pyx":453 + * is_hyperlink = 1 + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG: # <<<<<<<<<<<<<< + * self.close_word_group() + * if attrs.values.get(ATTR_ALT): + */ + } + __pyx_L33:; + + /* "gumbocy.pyx":466 + * + * + * if is_head: # <<<<<<<<<<<<<< + * if node.v.element.tag == gumbocy.GUMBO_TAG_LINK: + * + */ + __pyx_t_2 = (__pyx_v_is_head != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":467 + * + * if is_head: + * if node.v.element.tag == gumbocy.GUMBO_TAG_LINK: # <<<<<<<<<<<<<< + * + * # TODO: more properties + */ + __pyx_t_2 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_LINK) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":470 + * + * # TODO: more properties + * if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF): # <<<<<<<<<<<<<< + * self.analysis.setdefault("head_links", []) + * 
self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]}) + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 470, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_REL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 470, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 470, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 470, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (__pyx_t_1) { + } else { + __pyx_t_2 = __pyx_t_1; + goto __pyx_L38_bool_binop_done; + } + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 470, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 470, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_5, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 470, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 470, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = __pyx_t_1; + __pyx_L38_bool_binop_done:; + if (__pyx_t_2) { + + /* "gumbocy.pyx":471 + * # TODO: more properties + * if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF): + * self.analysis.setdefault("head_links", []) # <<<<<<<<<<<<<< + * self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": 
attrs.values[ATTR_HREF]}) + * + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "setdefault"); + __PYX_ERR(0, 471, __pyx_L1_error) + } + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 471, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_SetDefault(__pyx_v_self->analysis, __pyx_n_s_head_links, __pyx_t_3, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 471, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + + /* "gumbocy.pyx":472 + * if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF): + * self.analysis.setdefault("head_links", []) + * self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]}) # <<<<<<<<<<<<<< + * + * elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META: + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 472, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyDict_GetItem(__pyx_v_self->analysis, __pyx_n_s_head_links); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = PyDict_New(); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 472, __pyx_L1_error) + } + __pyx_t_6 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_REL); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_11 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_6); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (PyDict_SetItem(__pyx_t_3, 
__pyx_n_s_rel, __pyx_t_11) < 0) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 472, __pyx_L1_error) + } + __pyx_t_11 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __pyx_t_6 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_11); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_href, __pyx_t_6) < 0) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_10 = __Pyx_PyObject_Append(__pyx_t_5, __pyx_t_3); if (unlikely(__pyx_t_10 == -1)) __PYX_ERR(0, 472, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":470 + * + * # TODO: more properties + * if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF): # <<<<<<<<<<<<<< + * self.analysis.setdefault("head_links", []) + * self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]}) + */ + } + + /* "gumbocy.pyx":467 + * + * if is_head: + * if node.v.element.tag == gumbocy.GUMBO_TAG_LINK: # <<<<<<<<<<<<<< + * + * # TODO: more properties + */ + goto __pyx_L36; + } + + /* "gumbocy.pyx":474 + * self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]}) + * + * elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META: # <<<<<<<<<<<<<< + * + * if attrs.values.get(ATTR_CONTENT): + */ + __pyx_t_1 = (__pyx_v_self->has_metas_whitelist != 0); + if (__pyx_t_1) { + } else { + __pyx_t_2 = __pyx_t_1; + goto __pyx_L40_bool_binop_done; + } + __pyx_t_1 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_META) != 0); + 
__pyx_t_2 = __pyx_t_1; + __pyx_L40_bool_binop_done:; + if (__pyx_t_2) { + + /* "gumbocy.pyx":476 + * elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META: + * + * if attrs.values.get(ATTR_CONTENT): # <<<<<<<<<<<<<< + * + * if attrs.values.get(ATTR_NAME): + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 476, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_CONTENT); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 476, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 476, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 476, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":478 + * if attrs.values.get(ATTR_CONTENT): + * + * if attrs.values.get(ATTR_NAME): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 478, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_NAME); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 478, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_5, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 478, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 478, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); 
__pyx_t_3 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":479 + * + * if attrs.values.get(ATTR_NAME): + * if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)): # <<<<<<<<<<<<<< + * self.analysis.setdefault("head_metas", {}) + * self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip() + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 479, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_NAME); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 479, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 479, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_9 = __Pyx_PyObject_AsString(__pyx_t_5); if (unlikely((!__pyx_t_9) && PyErr_Occurred())) __PYX_ERR(0, 479, __pyx_L1_error) + __pyx_t_2 = (__pyx_f_7gumbocy_re2_search(__pyx_t_9, (*__pyx_v_self->metas_whitelist)) != 0); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":480 + * if attrs.values.get(ATTR_NAME): + * if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) # <<<<<<<<<<<<<< + * self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip() + * + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "setdefault"); + __PYX_ERR(0, 480, __pyx_L1_error) + } + __pyx_t_5 = PyDict_New(); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 480, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyDict_SetDefault(__pyx_v_self->analysis, __pyx_n_s_head_metas, __pyx_t_5, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 480, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); 
__pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":481 + * if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) + * self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip() # <<<<<<<<<<<<<< + * + * elif attrs.values.get(ATTR_PROPERTY): + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 481, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_CONTENT); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_5); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_5, NULL); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_6, __pyx_n_s_strip); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_5))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_5, function); + } + } + if (__pyx_t_6) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 481, 
__pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else { + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 481, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 481, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyDict_GetItem(__pyx_v_self->analysis, __pyx_n_s_head_metas); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 481, __pyx_L1_error) + } + __pyx_t_6 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_NAME); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_11 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_6); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (unlikely(PyObject_SetItem(__pyx_t_5, __pyx_t_11, __pyx_t_3) < 0)) __PYX_ERR(0, 481, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "gumbocy.pyx":479 + * + * if attrs.values.get(ATTR_NAME): + * if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)): # <<<<<<<<<<<<<< + * self.analysis.setdefault("head_metas", {}) + * self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip() + */ + } + + /* "gumbocy.pyx":478 + * if attrs.values.get(ATTR_CONTENT): + * + * if attrs.values.get(ATTR_NAME): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) + */ + goto __pyx_L43; + } + + /* "gumbocy.pyx":483 + * 
self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip() + * + * elif attrs.values.get(ATTR_PROPERTY): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 483, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_PROPERTY); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 483, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_11 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_3, Py_None); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 483, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_11); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 483, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":484 + * + * elif attrs.values.get(ATTR_PROPERTY): + * if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)): # <<<<<<<<<<<<<< + * self.analysis.setdefault("head_metas", {}) + * self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip() + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 484, __pyx_L1_error) + } + __pyx_t_11 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_PROPERTY); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 484, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_11); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 484, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + __pyx_t_9 = __Pyx_PyObject_AsString(__pyx_t_3); if (unlikely((!__pyx_t_9) && 
PyErr_Occurred())) __PYX_ERR(0, 484, __pyx_L1_error) + __pyx_t_2 = (__pyx_f_7gumbocy_re2_search(__pyx_t_9, (*__pyx_v_self->metas_whitelist)) != 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_2) { + + /* "gumbocy.pyx":485 + * elif attrs.values.get(ATTR_PROPERTY): + * if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) # <<<<<<<<<<<<<< + * self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip() + * + */ + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "setdefault"); + __PYX_ERR(0, 485, __pyx_L1_error) + } + __pyx_t_3 = PyDict_New(); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 485, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_11 = __Pyx_PyDict_SetDefault(__pyx_v_self->analysis, __pyx_n_s_head_metas, __pyx_t_3, 1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 485, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + + /* "gumbocy.pyx":486 + * if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) + * self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip() # <<<<<<<<<<<<<< + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE: + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 486, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_CONTENT); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + 
__pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_3, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_strip); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + } + } + if (__pyx_t_5) { + __pyx_t_11 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else { + __pyx_t_11 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 486, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 486, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_self->analysis, __pyx_n_s_head_metas); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 486, __pyx_L1_error) + } + __pyx_t_5 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_PROPERTY); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 486, __pyx_L1_error) + 
__Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_5); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (unlikely(PyObject_SetItem(__pyx_t_3, __pyx_t_6, __pyx_t_11) < 0)) __PYX_ERR(0, 486, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + + /* "gumbocy.pyx":484 + * + * elif attrs.values.get(ATTR_PROPERTY): + * if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)): # <<<<<<<<<<<<<< + * self.analysis.setdefault("head_metas", {}) + * self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip() + */ + } + + /* "gumbocy.pyx":483 + * self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip() + * + * elif attrs.values.get(ATTR_PROPERTY): # <<<<<<<<<<<<<< + * if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)): + * self.analysis.setdefault("head_metas", {}) + */ + } + __pyx_L43:; + + /* "gumbocy.pyx":476 + * elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META: + * + * if attrs.values.get(ATTR_CONTENT): # <<<<<<<<<<<<<< + * + * if attrs.values.get(ATTR_NAME): + */ + } + + /* "gumbocy.pyx":474 + * self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]}) + * + * elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META: # <<<<<<<<<<<<<< + * + * if attrs.values.get(ATTR_CONTENT): + */ + goto __pyx_L36; + } + + /* "gumbocy.pyx":488 + * self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip() + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE: # <<<<<<<<<<<<<< + * if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis: + * self.analysis["base_url"] = attrs.values[ATTR_HREF] + */ + __pyx_t_2 = 
((__pyx_v_node->v.element.tag == GUMBO_TAG_BASE) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":489 + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE: + * if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis: # <<<<<<<<<<<<<< + * self.analysis["base_url"] = attrs.values[ATTR_HREF] + * + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); + __PYX_ERR(0, 489, __pyx_L1_error) + } + __pyx_t_11 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 489, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __pyx_t_6 = __Pyx_PyDict_GetItemDefault(__pyx_v_attrs->values, __pyx_t_11, Py_None); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 489, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_6); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 489, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_1) { + } else { + __pyx_t_2 = __pyx_t_1; + goto __pyx_L47_bool_binop_done; + } + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(0, 489, __pyx_L1_error) + } + __pyx_t_1 = (__Pyx_PyDict_ContainsTF(__pyx_n_s_base_url, __pyx_v_self->analysis, Py_NE)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 489, __pyx_L1_error) + __pyx_t_12 = (__pyx_t_1 != 0); + __pyx_t_2 = __pyx_t_12; + __pyx_L47_bool_binop_done:; + if (__pyx_t_2) { + + /* "gumbocy.pyx":490 + * elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE: + * if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis: + * self.analysis["base_url"] = attrs.values[ATTR_HREF] # <<<<<<<<<<<<<< + * + * # TODO is_article + */ + if (unlikely(__pyx_v_attrs->values == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 490, __pyx_L1_error) + } + 
__pyx_t_6 = __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_e_7gumbocy_ATTR_HREF); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 490, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_11 = __Pyx_PyDict_GetItem(__pyx_v_attrs->values, __pyx_t_6); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 490, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 490, __pyx_L1_error) + } + if (unlikely(PyDict_SetItem(__pyx_v_self->analysis, __pyx_n_s_base_url, __pyx_t_11) < 0)) __PYX_ERR(0, 490, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + + /* "gumbocy.pyx":489 + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE: + * if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis: # <<<<<<<<<<<<<< + * self.analysis["base_url"] = attrs.values[ATTR_HREF] + * + */ + } + + /* "gumbocy.pyx":488 + * self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip() + * + * elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE: # <<<<<<<<<<<<<< + * if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis: + * self.analysis["base_url"] = attrs.values[ATTR_HREF] + */ + } + __pyx_L36:; + + /* "gumbocy.pyx":466 + * + * + * if is_head: # <<<<<<<<<<<<<< + * if node.v.element.tag == gumbocy.GUMBO_TAG_LINK: + * + */ + } + + /* "gumbocy.pyx":494 + * # TODO is_article + * + * if not is_hidden: # <<<<<<<<<<<<<< + * is_hidden = self.guess_node_hidden(node, attrs) + * + */ + __pyx_t_2 = ((!(__pyx_v_is_hidden != 0)) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":495 + * + * if not is_hidden: + * is_hidden = self.guess_node_hidden(node, attrs) # <<<<<<<<<<<<<< + * + * if is_boilerplate and not is_boilerplate_bypassed: + */ + __pyx_v_is_hidden = ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->guess_node_hidden(__pyx_v_self, __pyx_v_node, 
__pyx_v_attrs); + + /* "gumbocy.pyx":494 + * # TODO is_article + * + * if not is_hidden: # <<<<<<<<<<<<<< + * is_hidden = self.guess_node_hidden(node, attrs) + * + */ + } + + /* "gumbocy.pyx":497 + * is_hidden = self.guess_node_hidden(node, attrs) + * + * if is_boilerplate and not is_boilerplate_bypassed: # <<<<<<<<<<<<<< + * if self.tags_boilerplate_bypass.count(tag_n): + * is_boilerplate_bypassed = True + */ + __pyx_t_12 = (__pyx_v_is_boilerplate != 0); + if (__pyx_t_12) { + } else { + __pyx_t_2 = __pyx_t_12; + goto __pyx_L51_bool_binop_done; + } + __pyx_t_12 = ((!(__pyx_v_is_boilerplate_bypassed != 0)) != 0); + __pyx_t_2 = __pyx_t_12; + __pyx_L51_bool_binop_done:; + if (__pyx_t_2) { + + /* "gumbocy.pyx":498 + * + * if is_boilerplate and not is_boilerplate_bypassed: + * if self.tags_boilerplate_bypass.count(tag_n): # <<<<<<<<<<<<<< + * is_boilerplate_bypassed = True + * + */ + __pyx_t_2 = (__pyx_v_self->tags_boilerplate_bypass.count(__pyx_v_tag_n) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":499 + * if is_boilerplate and not is_boilerplate_bypassed: + * if self.tags_boilerplate_bypass.count(tag_n): + * is_boilerplate_bypassed = True # <<<<<<<<<<<<<< + * + * if not is_boilerplate: + */ + __pyx_v_is_boilerplate_bypassed = 1; + + /* "gumbocy.pyx":498 + * + * if is_boilerplate and not is_boilerplate_bypassed: + * if self.tags_boilerplate_bypass.count(tag_n): # <<<<<<<<<<<<<< + * is_boilerplate_bypassed = True + * + */ + } + + /* "gumbocy.pyx":497 + * is_hidden = self.guess_node_hidden(node, attrs) + * + * if is_boilerplate and not is_boilerplate_bypassed: # <<<<<<<<<<<<<< + * if self.tags_boilerplate_bypass.count(tag_n): + * is_boilerplate_bypassed = True + */ + } + + /* "gumbocy.pyx":501 + * is_boilerplate_bypassed = True + * + * if not is_boilerplate: # <<<<<<<<<<<<<< + * is_boilerplate = self.guess_node_boilerplate(node, attrs) + * + */ + __pyx_t_2 = ((!(__pyx_v_is_boilerplate != 0)) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":502 + * + * if not 
is_boilerplate: + * is_boilerplate = self.guess_node_boilerplate(node, attrs) # <<<<<<<<<<<<<< + * + * # print " " * level, "BOILER", tag_name, is_boilerplate, dict(attrs.values), attrs.classes + */ + __pyx_v_is_boilerplate = ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->guess_node_boilerplate(__pyx_v_self, __pyx_v_node, __pyx_v_attrs); + + /* "gumbocy.pyx":501 + * is_boilerplate_bypassed = True + * + * if not is_boilerplate: # <<<<<<<<<<<<<< + * is_boilerplate = self.guess_node_boilerplate(node, attrs) + * + */ + } + + /* "gumbocy.pyx":507 + * + * # Close the word group + * if self.tags_separators.count(tag_n): # <<<<<<<<<<<<<< + * self.close_word_group() + * + */ + __pyx_t_2 = (__pyx_v_self->tags_separators.count(__pyx_v_tag_n) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":508 + * # Close the word group + * if self.tags_separators.count(tag_n): + * self.close_word_group() # <<<<<<<<<<<<<< + * + * # Call _traverse_node() recursively for each of the children + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->close_word_group(__pyx_v_self); + + /* "gumbocy.pyx":507 + * + * # Close the word group + * if self.tags_separators.count(tag_n): # <<<<<<<<<<<<<< + * self.close_word_group() + * + */ + } + + /* "gumbocy.pyx":511 + * + * # Call _traverse_node() recursively for each of the children + * for i in range(node.v.element.children.length): # <<<<<<<<<<<<<< + * child = node.v.element.children.data[i] + * if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1: + */ + __pyx_t_13 = __pyx_v_node->v.element.children.length; + for (__pyx_t_14 = 0; __pyx_t_14 < __pyx_t_13; __pyx_t_14+=1) { + __pyx_v_i = __pyx_t_14; + + /* "gumbocy.pyx":512 + * # Call _traverse_node() recursively for each of the children + * for i in range(node.v.element.children.length): + * child = node.v.element.children.data[i] # <<<<<<<<<<<<<< + * if self._traverse_node(level + 1, 
child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1: + * break + */ + __pyx_v_child = ((GumboNode *)(__pyx_v_node->v.element.children.data[__pyx_v_i])); + + /* "gumbocy.pyx":513 + * for i in range(node.v.element.children.length): + * child = node.v.element.children.data[i] + * if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1: # <<<<<<<<<<<<<< + * break + * + */ + __pyx_t_2 = ((((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->_traverse_node(__pyx_v_self, (__pyx_v_level + 1), __pyx_v_child, __pyx_v_is_head, __pyx_v_is_hidden, __pyx_v_is_boilerplate, __pyx_v_is_boilerplate_bypassed, __pyx_v_is_hyperlink) == 1) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":514 + * child = node.v.element.children.data[i] + * if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1: + * break # <<<<<<<<<<<<<< + * + * # Close the word group + */ + goto __pyx_L57_break; + + /* "gumbocy.pyx":513 + * for i in range(node.v.element.children.length): + * child = node.v.element.children.data[i] + * if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1: # <<<<<<<<<<<<<< + * break + * + */ + } + } + __pyx_L57_break:; + + /* "gumbocy.pyx":517 + * + * # Close the word group + * if self.tags_separators.count(tag_n): # <<<<<<<<<<<<<< + * self.close_word_group() + * + */ + __pyx_t_2 = (__pyx_v_self->tags_separators.count(__pyx_v_tag_n) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":518 + * # Close the word group + * if self.tags_separators.count(tag_n): + * self.close_word_group() # <<<<<<<<<<<<<< + * + * self.current_stack.pop() + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->close_word_group(__pyx_v_self); + + /* "gumbocy.pyx":517 + * + * # Close the word group + * if self.tags_separators.count(tag_n): # 
<<<<<<<<<<<<<< + * self.close_word_group() + * + */ + } + + /* "gumbocy.pyx":520 + * self.close_word_group() + * + * self.current_stack.pop() # <<<<<<<<<<<<<< + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_A: + */ + if (unlikely(__pyx_v_self->current_stack == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "pop"); + __PYX_ERR(0, 520, __pyx_L1_error) + } + __pyx_t_11 = __Pyx_PyList_Pop(__pyx_v_self->current_stack); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 520, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + + /* "gumbocy.pyx":522 + * self.current_stack.pop() + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_A: # <<<<<<<<<<<<<< + * self.close_hyperlink() + * + */ + __pyx_t_2 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_A) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":523 + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_A: + * self.close_hyperlink() # <<<<<<<<<<<<<< + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->close_hyperlink(__pyx_v_self); + + /* "gumbocy.pyx":522 + * self.current_stack.pop() + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_A: # <<<<<<<<<<<<<< + * self.close_hyperlink() + * + */ + } + + /* "gumbocy.pyx":525 + * self.close_hyperlink() + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: # <<<<<<<<<<<<<< + * if self.head_only: + * return 1 + */ + __pyx_t_2 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_HEAD) != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":526 + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + * if self.head_only: # <<<<<<<<<<<<<< + * return 1 + * + */ + __pyx_t_2 = (__pyx_v_self->head_only != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":527 + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + * if self.head_only: + * return 1 # <<<<<<<<<<<<<< + * + * return 0 + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":526 + * + * if 
node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + * if self.head_only: # <<<<<<<<<<<<<< + * return 1 + * + */ + } + + /* "gumbocy.pyx":525 + * self.close_hyperlink() + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: # <<<<<<<<<<<<<< + * if self.head_only: + * return 1 + */ + } + + /* "gumbocy.pyx":404 + * self.add_text(node.v.text.text) + * + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: # <<<<<<<<<<<<<< + * + * tag_n = node.v.element.tag + */ + break; + default: break; + } + + /* "gumbocy.pyx":529 + * return 1 + * + * return 0 # <<<<<<<<<<<<<< + * + * def parse(self, char* html): + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":386 + * self.current_hyperlink = None + * + * cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node, bint is_head, bint is_hidden, bint is_boilerplate, bint is_boilerplate_bypassed, bint is_hyperlink): # <<<<<<<<<<<<<< + * """ Traverses the node tree. Return 1 to stop at this level """ + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_11); + __Pyx_WriteUnraisable("gumbocy.HTMLParser._traverse_node", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_py_tag_name); + __Pyx_XDECREF((PyObject *)__pyx_v_attrs); + __Pyx_XDECREF(__pyx_v_v); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "gumbocy.pyx":531 + * return 0 + * + * def parse(self, char* html): # <<<<<<<<<<<<<< + * """ Do the actual parsing of the HTML with gumbo """ + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_7gumbocy_10HTMLParser_3parse(PyObject *__pyx_v_self, PyObject *__pyx_arg_html); /*proto*/ +static char __pyx_doc_7gumbocy_10HTMLParser_2parse[] = " Do the actual parsing of the HTML with gumbo "; +static PyObject *__pyx_pw_7gumbocy_10HTMLParser_3parse(PyObject *__pyx_v_self, PyObject *__pyx_arg_html) { + char *__pyx_v_html; + PyObject *__pyx_r = 0; + 
__Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("parse (wrapper)", 0); + assert(__pyx_arg_html); { + __pyx_v_html = __Pyx_PyObject_AsString(__pyx_arg_html); if (unlikely((!__pyx_v_html) && PyErr_Occurred())) __PYX_ERR(0, 531, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("gumbocy.HTMLParser.parse", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_7gumbocy_10HTMLParser_2parse(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self), ((char *)__pyx_v_html)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_7gumbocy_10HTMLParser_2parse(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, char *__pyx_v_html) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("parse", 0); + + /* "gumbocy.pyx":534 + * """ Do the actual parsing of the HTML with gumbo """ + * + * self.free() # <<<<<<<<<<<<<< + * self.output = gumbocy.gumbo_parse(html) + * self.has_output = 1 + */ + __pyx_t_1 = ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->free(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 534, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "gumbocy.pyx":535 + * + * self.free() + * self.output = gumbocy.gumbo_parse(html) # <<<<<<<<<<<<<< + * self.has_output = 1 + * + */ + __pyx_v_self->output = gumbo_parse(__pyx_v_html); + + /* "gumbocy.pyx":536 + * self.free() + * self.output = gumbocy.gumbo_parse(html) + * self.has_output = 1 # <<<<<<<<<<<<<< + * + * def analyze(self): + */ + __pyx_v_self->has_output = 1; + + /* "gumbocy.pyx":531 + * return 0 + * + * def parse(self, char* html): # <<<<<<<<<<<<<< + * """ Do the actual parsing of the HTML with gumbo """ + * + */ + + /* function exit code */ + __pyx_r = Py_None; 
__Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("gumbocy.HTMLParser.parse", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "gumbocy.pyx":538 + * self.has_output = 1 + * + * def analyze(self): # <<<<<<<<<<<<<< + * """ Traverse the parsed tree and return the results """ + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_7gumbocy_10HTMLParser_5analyze(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static char __pyx_doc_7gumbocy_10HTMLParser_4analyze[] = " Traverse the parsed tree and return the results "; +static PyObject *__pyx_pw_7gumbocy_10HTMLParser_5analyze(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("analyze (wrapper)", 0); + __pyx_r = __pyx_pf_7gumbocy_10HTMLParser_4analyze(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_7gumbocy_10HTMLParser_4analyze(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + __Pyx_RefNannySetupContext("analyze", 0); + + /* "gumbocy.pyx":541 + * """ Traverse the parsed tree and return the results """ + * + * self.analysis = {} # <<<<<<<<<<<<<< + * + * if self.analyze_internal_hyperlinks: + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 541, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); + __Pyx_GOTREF(__pyx_v_self->analysis); + __Pyx_DECREF(__pyx_v_self->analysis); + __pyx_v_self->analysis = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "gumbocy.pyx":543 + * self.analysis = {} + * + * if self.analyze_internal_hyperlinks: # <<<<<<<<<<<<<< + * 
self.analysis["internal_hyperlinks"] = [] + * + */ + __pyx_t_2 = (__pyx_v_self->analyze_internal_hyperlinks != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":544 + * + * if self.analyze_internal_hyperlinks: + * self.analysis["internal_hyperlinks"] = [] # <<<<<<<<<<<<<< + * + * if self.analyze_external_hyperlinks: + */ + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 544, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 544, __pyx_L1_error) + } + if (unlikely(PyDict_SetItem(__pyx_v_self->analysis, __pyx_n_s_internal_hyperlinks, __pyx_t_1) < 0)) __PYX_ERR(0, 544, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "gumbocy.pyx":543 + * self.analysis = {} + * + * if self.analyze_internal_hyperlinks: # <<<<<<<<<<<<<< + * self.analysis["internal_hyperlinks"] = [] + * + */ + } + + /* "gumbocy.pyx":546 + * self.analysis["internal_hyperlinks"] = [] + * + * if self.analyze_external_hyperlinks: # <<<<<<<<<<<<<< + * self.analysis["external_hyperlinks"] = [] + * + */ + __pyx_t_2 = (__pyx_v_self->analyze_external_hyperlinks != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":547 + * + * if self.analyze_external_hyperlinks: + * self.analysis["external_hyperlinks"] = [] # <<<<<<<<<<<<<< + * + * if self.analyze_word_groups: + */ + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 547, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 547, __pyx_L1_error) + } + if (unlikely(PyDict_SetItem(__pyx_v_self->analysis, __pyx_n_s_external_hyperlinks, __pyx_t_1) < 0)) __PYX_ERR(0, 547, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "gumbocy.pyx":546 + * self.analysis["internal_hyperlinks"] = [] + * + * if self.analyze_external_hyperlinks: # <<<<<<<<<<<<<< + * 
self.analysis["external_hyperlinks"] = [] + * + */ + } + + /* "gumbocy.pyx":549 + * self.analysis["external_hyperlinks"] = [] + * + * if self.analyze_word_groups: # <<<<<<<<<<<<<< + * self.analysis["word_groups"] = [] + * + */ + __pyx_t_2 = (__pyx_v_self->analyze_word_groups != 0); + if (__pyx_t_2) { + + /* "gumbocy.pyx":550 + * + * if self.analyze_word_groups: + * self.analysis["word_groups"] = [] # <<<<<<<<<<<<<< + * + * self.current_stack = [] + */ + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 550, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (unlikely(__pyx_v_self->analysis == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(0, 550, __pyx_L1_error) + } + if (unlikely(PyDict_SetItem(__pyx_v_self->analysis, __pyx_n_s_word_groups, __pyx_t_1) < 0)) __PYX_ERR(0, 550, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "gumbocy.pyx":549 + * self.analysis["external_hyperlinks"] = [] + * + * if self.analyze_word_groups: # <<<<<<<<<<<<<< + * self.analysis["word_groups"] = [] + * + */ + } + + /* "gumbocy.pyx":552 + * self.analysis["word_groups"] = [] + * + * self.current_stack = [] # <<<<<<<<<<<<<< + * self.current_word_group = None + * self.current_hyperlink = None + */ + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 552, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); + __Pyx_GOTREF(__pyx_v_self->current_stack); + __Pyx_DECREF(__pyx_v_self->current_stack); + __pyx_v_self->current_stack = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "gumbocy.pyx":553 + * + * self.current_stack = [] + * self.current_word_group = None # <<<<<<<<<<<<<< + * self.current_hyperlink = None + * + */ + __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(Py_None); + __Pyx_GOTREF(__pyx_v_self->current_word_group); + __Pyx_DECREF(__pyx_v_self->current_word_group); + __pyx_v_self->current_word_group = Py_None; + + /* "gumbocy.pyx":554 + * self.current_stack = [] + * 
self.current_word_group = None + * self.current_hyperlink = None # <<<<<<<<<<<<<< + * + * self._traverse_node(0, self.output.root, 0, 0, 0, 0, 0) + */ + __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(Py_None); + __Pyx_GOTREF(__pyx_v_self->current_hyperlink); + __Pyx_DECREF(__pyx_v_self->current_hyperlink); + __pyx_v_self->current_hyperlink = Py_None; + + /* "gumbocy.pyx":556 + * self.current_hyperlink = None + * + * self._traverse_node(0, self.output.root, 0, 0, 0, 0, 0) # <<<<<<<<<<<<<< + * + * return self.analysis + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->_traverse_node(__pyx_v_self, 0, __pyx_v_self->output->root, 0, 0, 0, 0, 0); + + /* "gumbocy.pyx":558 + * self._traverse_node(0, self.output.root, 0, 0, 0, 0, 0) + * + * return self.analysis # <<<<<<<<<<<<<< + * + * # + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_self->analysis); + __pyx_r = __pyx_v_self->analysis; + goto __pyx_L0; + + /* "gumbocy.pyx":538 + * self.has_output = 1 + * + * def analyze(self): # <<<<<<<<<<<<<< + * """ Traverse the parsed tree and return the results """ + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("gumbocy.HTMLParser.analyze", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "gumbocy.pyx":564 + * # + * + * def listnodes(self): # <<<<<<<<<<<<<< + * """ Return the nodes as a flat list of tuples """ + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_7gumbocy_10HTMLParser_7listnodes(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static char __pyx_doc_7gumbocy_10HTMLParser_6listnodes[] = " Return the nodes as a flat list of tuples "; +static PyObject *__pyx_pw_7gumbocy_10HTMLParser_7listnodes(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("listnodes 
(wrapper)", 0); + __pyx_r = __pyx_pf_7gumbocy_10HTMLParser_6listnodes(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_7gumbocy_10HTMLParser_6listnodes(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("listnodes", 0); + + /* "gumbocy.pyx":567 + * """ Return the nodes as a flat list of tuples """ + * + * self.nodes = [] # <<<<<<<<<<<<<< + * + * self._traverse_node_simple(0, self.output.root) + */ + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 567, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); + __Pyx_GOTREF(__pyx_v_self->nodes); + __Pyx_DECREF(__pyx_v_self->nodes); + __pyx_v_self->nodes = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "gumbocy.pyx":569 + * self.nodes = [] + * + * self._traverse_node_simple(0, self.output.root) # <<<<<<<<<<<<<< + * + * return self.nodes + */ + ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->_traverse_node_simple(__pyx_v_self, 0, __pyx_v_self->output->root); + + /* "gumbocy.pyx":571 + * self._traverse_node_simple(0, self.output.root) + * + * return self.nodes # <<<<<<<<<<<<<< + * + * cdef bint _traverse_node_simple(self, int level, gumbocy.GumboNode* node): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_self->nodes); + __pyx_r = __pyx_v_self->nodes; + goto __pyx_L0; + + /* "gumbocy.pyx":564 + * # + * + * def listnodes(self): # <<<<<<<<<<<<<< + * """ Return the nodes as a flat list of tuples """ + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("gumbocy.HTMLParser.listnodes", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "gumbocy.pyx":573 + * return 
self.nodes + * + * cdef bint _traverse_node_simple(self, int level, gumbocy.GumboNode* node): # <<<<<<<<<<<<<< + * """ Traverses the node tree. Return 1 to stop at this level """ + * + */ + +static int __pyx_f_7gumbocy_10HTMLParser__traverse_node_simple(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self, int __pyx_v_level, GumboNode *__pyx_v_node) { + GumboStringPiece __pyx_v_gsp; + int __pyx_v_tag_n; + char const *__pyx_v_tag_name; + PyObject *__pyx_v_py_tag_name = NULL; + int __pyx_v_has_attrs; + PyObject *__pyx_v_attrs = NULL; + unsigned int __pyx_v_i; + GumboAttribute *__pyx_v_attr; + PyObject *__pyx_v_attr_name = NULL; + PyObject *__pyx_v_multiple_value = NULL; + PyObject *__pyx_v_v = NULL; + GumboNode *__pyx_v_child; + int __pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + int __pyx_t_5; + int __pyx_t_6; + GumboStringPiece __pyx_t_7; + char const *__pyx_t_8; + unsigned int __pyx_t_9; + unsigned int __pyx_t_10; + char const *__pyx_t_11; + PyObject *__pyx_t_12 = NULL; + PyObject *__pyx_t_13 = NULL; + PyObject *__pyx_t_14 = NULL; + Py_ssize_t __pyx_t_15; + PyObject *(*__pyx_t_16)(PyObject *); + __Pyx_RefNannySetupContext("_traverse_node_simple", 0); + + /* "gumbocy.pyx":578 + * cdef GumboStringPiece gsp + * + * if level > self.nesting_limit: # <<<<<<<<<<<<<< + * return 0 + * + */ + __pyx_t_1 = ((__pyx_v_level > __pyx_v_self->nesting_limit) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":579 + * + * if level > self.nesting_limit: + * return 0 # <<<<<<<<<<<<<< + * + * if node.type == gumbocy.GUMBO_NODE_TEXT: + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":578 + * cdef GumboStringPiece gsp + * + * if level > self.nesting_limit: # <<<<<<<<<<<<<< + * return 0 + * + */ + } + + /* "gumbocy.pyx":581 + * return 0 + * + * if node.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< + * self.nodes.append((level, None, node.v.text.text)) + * + */ + switch 
(__pyx_v_node->type) { + case GUMBO_NODE_TEXT: + + /* "gumbocy.pyx":582 + * + * if node.type == gumbocy.GUMBO_NODE_TEXT: + * self.nodes.append((level, None, node.v.text.text)) # <<<<<<<<<<<<<< + * + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: + */ + if (unlikely(__pyx_v_self->nodes == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); + __PYX_ERR(0, 582, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 582, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_node->v.text.text); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 582, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 582, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_2); + __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(Py_None); + PyTuple_SET_ITEM(__pyx_t_4, 1, Py_None); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_4, 2, __pyx_t_3); + __pyx_t_2 = 0; + __pyx_t_3 = 0; + __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_4); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 582, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + + /* "gumbocy.pyx":581 + * return 0 + * + * if node.type == gumbocy.GUMBO_NODE_TEXT: # <<<<<<<<<<<<<< + * self.nodes.append((level, None, node.v.text.text)) + * + */ + break; + + /* "gumbocy.pyx":584 + * self.nodes.append((level, None, node.v.text.text)) + * + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: # <<<<<<<<<<<<<< + * + * tag_n = node.v.element.tag + */ + case GUMBO_NODE_ELEMENT: + + /* "gumbocy.pyx":586 + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: + * + * tag_n = node.v.element.tag # <<<<<<<<<<<<<< + * + * if self.head_only and self.tags_ignore_head_only.count(tag_n): + */ + __pyx_v_tag_n = ((int)__pyx_v_node->v.element.tag); + + /* "gumbocy.pyx":588 + * tag_n = node.v.element.tag + 
* + * if self.head_only and self.tags_ignore_head_only.count(tag_n): # <<<<<<<<<<<<<< + * return 1 + * + */ + __pyx_t_6 = (__pyx_v_self->head_only != 0); + if (__pyx_t_6) { + } else { + __pyx_t_1 = __pyx_t_6; + goto __pyx_L5_bool_binop_done; + } + __pyx_t_6 = (__pyx_v_self->tags_ignore_head_only.count(__pyx_v_tag_n) != 0); + __pyx_t_1 = __pyx_t_6; + __pyx_L5_bool_binop_done:; + if (__pyx_t_1) { + + /* "gumbocy.pyx":589 + * + * if self.head_only and self.tags_ignore_head_only.count(tag_n): + * return 1 # <<<<<<<<<<<<<< + * + * if self.tags_ignore.count(tag_n): + */ + __pyx_r = 1; + goto __pyx_L0; + + /* "gumbocy.pyx":588 + * tag_n = node.v.element.tag + * + * if self.head_only and self.tags_ignore_head_only.count(tag_n): # <<<<<<<<<<<<<< + * return 1 + * + */ + } + + /* "gumbocy.pyx":591 + * return 1 + * + * if self.tags_ignore.count(tag_n): # <<<<<<<<<<<<<< + * return 0 + * + */ + __pyx_t_1 = (__pyx_v_self->tags_ignore.count(__pyx_v_tag_n) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":592 + * + * if self.tags_ignore.count(tag_n): + * return 0 # <<<<<<<<<<<<<< + * + * tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag) + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":591 + * return 1 + * + * if self.tags_ignore.count(tag_n): # <<<<<<<<<<<<<< + * return 0 + * + */ + } + + /* "gumbocy.pyx":594 + * return 0 + * + * tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag) # <<<<<<<<<<<<<< + * + * # When we find an unknown tag, find its tag_name in the buffer + */ + __pyx_v_tag_name = gumbo_normalized_tagname(__pyx_v_node->v.element.tag); + + /* "gumbocy.pyx":597 + * + * # When we find an unknown tag, find its tag_name in the buffer + * if tag_name == b"": # <<<<<<<<<<<<<< + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) + */ + __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 597, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_1 = (__Pyx_PyBytes_Equals(__pyx_t_4, 
__pyx_kp_b__5, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 597, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_1) { + + /* "gumbocy.pyx":598 + * # When we find an unknown tag, find its tag_name in the buffer + * if tag_name == b"": + * gsp = node.v.element.original_tag # <<<<<<<<<<<<<< + * gumbo_tag_from_original_text(&gsp) + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! + */ + __pyx_t_7 = __pyx_v_node->v.element.original_tag; + __pyx_v_gsp = __pyx_t_7; + + /* "gumbocy.pyx":599 + * if tag_name == b"": + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) # <<<<<<<<<<<<<< + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! + * tag_name = py_tag_name + */ + gumbo_tag_from_original_text((&__pyx_v_gsp)); + + /* "gumbocy.pyx":600 + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! 
# <<<<<<<<<<<<<< + * tag_name = py_tag_name + * + */ + __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_gsp.data); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 600, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 600, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); + __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 600, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_GetSlice(__pyx_t_3, 0, __pyx_v_gsp.length, NULL, NULL, NULL, 1, 1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 600, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_lower); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 600, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + } + } + if (__pyx_t_2) { + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 600, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + } else { + __pyx_t_4 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 600, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_py_tag_name = __pyx_t_4; + __pyx_t_4 = 0; + + /* "gumbocy.pyx":601 + * gumbo_tag_from_original_text(&gsp) + * py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! 
+ * tag_name = py_tag_name # <<<<<<<<<<<<<< + * + * if self.has_attributes_whitelist: + */ + __pyx_t_8 = __Pyx_PyObject_AsString(__pyx_v_py_tag_name); if (unlikely((!__pyx_t_8) && PyErr_Occurred())) __PYX_ERR(0, 601, __pyx_L1_error) + __pyx_v_tag_name = ((char const *)__pyx_t_8); + + /* "gumbocy.pyx":597 + * + * # When we find an unknown tag, find its tag_name in the buffer + * if tag_name == b"": # <<<<<<<<<<<<<< + * gsp = node.v.element.original_tag + * gumbo_tag_from_original_text(&gsp) + */ + } + + /* "gumbocy.pyx":603 + * tag_name = py_tag_name + * + * if self.has_attributes_whitelist: # <<<<<<<<<<<<<< + * + * # Build a dict with all the whitelisted attributes + */ + __pyx_t_1 = (__pyx_v_self->has_attributes_whitelist != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":606 + * + * # Build a dict with all the whitelisted attributes + * has_attrs = False # <<<<<<<<<<<<<< + * attrs = False + * for i in range(node.v.element.attributes.length): + */ + __pyx_v_has_attrs = 0; + + /* "gumbocy.pyx":607 + * # Build a dict with all the whitelisted attributes + * has_attrs = False + * attrs = False # <<<<<<<<<<<<<< + * for i in range(node.v.element.attributes.length): + * attr = node.v.element.attributes.data[i] + */ + __Pyx_INCREF(Py_False); + __pyx_v_attrs = Py_False; + + /* "gumbocy.pyx":608 + * has_attrs = False + * attrs = False + * for i in range(node.v.element.attributes.length): # <<<<<<<<<<<<<< + * attr = node.v.element.attributes.data[i] + * attr_name = str(attr.name) + */ + __pyx_t_9 = __pyx_v_node->v.element.attributes.length; + for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) { + __pyx_v_i = __pyx_t_10; + + /* "gumbocy.pyx":609 + * attrs = False + * for i in range(node.v.element.attributes.length): + * attr = node.v.element.attributes.data[i] # <<<<<<<<<<<<<< + * attr_name = str(attr.name) + * if re2_search(attr_name, deref(self.attributes_whitelist)): + */ + __pyx_v_attr = ((GumboAttribute *)(__pyx_v_node->v.element.attributes.data[__pyx_v_i])); + + /* 
"gumbocy.pyx":610 + * for i in range(node.v.element.attributes.length): + * attr = node.v.element.attributes.data[i] + * attr_name = str(attr.name) # <<<<<<<<<<<<<< + * if re2_search(attr_name, deref(self.attributes_whitelist)): + * if attr_name == b"class": + */ + __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_attr->name); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 610, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 610, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_3, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 610, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_XDECREF_SET(__pyx_v_attr_name, __pyx_t_4); + __pyx_t_4 = 0; + + /* "gumbocy.pyx":611 + * attr = node.v.element.attributes.data[i] + * attr_name = str(attr.name) + * if re2_search(attr_name, deref(self.attributes_whitelist)): # <<<<<<<<<<<<<< + * if attr_name == b"class": + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + */ + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_attr_name); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 611, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_11, (*__pyx_v_self->attributes_whitelist)) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":612 + * attr_name = str(attr.name) + * if re2_search(attr_name, deref(self.attributes_whitelist)): + * if attr_name == b"class": # <<<<<<<<<<<<<< + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * if len(multiple_value): + */ + __pyx_t_1 = (__Pyx_PyBytes_Equals(__pyx_v_attr_name, __pyx_n_b_class, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 612, __pyx_L1_error) + if (__pyx_t_1) { + + /* "gumbocy.pyx":613 + * if re2_search(attr_name, 
deref(self.attributes_whitelist)): + * if attr_name == b"class": + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) # <<<<<<<<<<<<<< + * if len(multiple_value): + * if self.has_classes_ignore: + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_RE_SPLIT_WHITESPACE); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_split); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_13 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_13); + __pyx_t_14 = __Pyx_PyObject_GetAttrStr(__pyx_t_13, __pyx_n_s_strip); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_14); + __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; + __pyx_t_13 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_14))) { + __pyx_t_13 = PyMethod_GET_SELF(__pyx_t_14); + if (likely(__pyx_t_13)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_14); + __Pyx_INCREF(__pyx_t_13); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_14, function); + } + } + if (__pyx_t_13) { + __pyx_t_12 = __Pyx_PyObject_CallOneArg(__pyx_t_14, __pyx_t_13); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; + } else { + __pyx_t_12 = __Pyx_PyObject_CallNoArg(__pyx_t_14); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 613, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_12); + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + __pyx_t_14 = __Pyx_PyObject_GetAttrStr(__pyx_t_12, __pyx_n_s_lower); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_14); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + __pyx_t_12 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_14))) { + __pyx_t_12 = 
PyMethod_GET_SELF(__pyx_t_14); + if (likely(__pyx_t_12)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_14); + __Pyx_INCREF(__pyx_t_12); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_14, function); + } + } + if (__pyx_t_12) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_14, __pyx_t_12); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } else { + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_14); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 613, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + __pyx_t_14 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_14 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_14)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_14); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_14) { + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else { + __pyx_t_12 = PyTuple_New(1+1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_14); __pyx_t_14 = NULL; + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_12, 0+1, __pyx_t_3); + __pyx_t_3 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_12, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyFrozenSet_New(__pyx_t_4); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 613, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_XDECREF_SET(__pyx_v_multiple_value, ((PyObject*)__pyx_t_2)); + __pyx_t_2 = 0; + + /* "gumbocy.pyx":614 
+ * if attr_name == b"class": + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * if len(multiple_value): # <<<<<<<<<<<<<< + * if self.has_classes_ignore: + * for v in multiple_value: + */ + __pyx_t_15 = PySet_GET_SIZE(__pyx_v_multiple_value); if (unlikely(__pyx_t_15 == -1)) __PYX_ERR(0, 614, __pyx_L1_error) + __pyx_t_1 = (__pyx_t_15 != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":615 + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * if len(multiple_value): + * if self.has_classes_ignore: # <<<<<<<<<<<<<< + * for v in multiple_value: + * if re2_search(v, deref(self.classes_ignore)): + */ + __pyx_t_1 = (__pyx_v_self->has_classes_ignore != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":616 + * if len(multiple_value): + * if self.has_classes_ignore: + * for v in multiple_value: # <<<<<<<<<<<<<< + * if re2_search(v, deref(self.classes_ignore)): + * return 0 + */ + __pyx_t_2 = PyObject_GetIter(__pyx_v_multiple_value); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 616, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_16 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 616, __pyx_L1_error) + for (;;) { + { + __pyx_t_4 = __pyx_t_16(__pyx_t_2); + if (unlikely(!__pyx_t_4)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 616, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_4); + } + __Pyx_XDECREF_SET(__pyx_v_v, __pyx_t_4); + __pyx_t_4 = 0; + + /* "gumbocy.pyx":617 + * if self.has_classes_ignore: + * for v in multiple_value: + * if re2_search(v, deref(self.classes_ignore)): # <<<<<<<<<<<<<< + * return 0 + * + */ + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_v); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 617, __pyx_L1_error) + __pyx_t_1 = (__pyx_f_7gumbocy_re2_search(__pyx_t_11, 
(*__pyx_v_self->classes_ignore)) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":618 + * for v in multiple_value: + * if re2_search(v, deref(self.classes_ignore)): + * return 0 # <<<<<<<<<<<<<< + * + * if not has_attrs: + */ + __pyx_r = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + goto __pyx_L0; + + /* "gumbocy.pyx":617 + * if self.has_classes_ignore: + * for v in multiple_value: + * if re2_search(v, deref(self.classes_ignore)): # <<<<<<<<<<<<<< + * return 0 + * + */ + } + + /* "gumbocy.pyx":616 + * if len(multiple_value): + * if self.has_classes_ignore: + * for v in multiple_value: # <<<<<<<<<<<<<< + * if re2_search(v, deref(self.classes_ignore)): + * return 0 + */ + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "gumbocy.pyx":615 + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * if len(multiple_value): + * if self.has_classes_ignore: # <<<<<<<<<<<<<< + * for v in multiple_value: + * if re2_search(v, deref(self.classes_ignore)): + */ + } + + /* "gumbocy.pyx":620 + * return 0 + * + * if not has_attrs: # <<<<<<<<<<<<<< + * attrs = {} + * has_attrs = True + */ + __pyx_t_1 = ((!(__pyx_v_has_attrs != 0)) != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":621 + * + * if not has_attrs: + * attrs = {} # <<<<<<<<<<<<<< + * has_attrs = True + * attrs[attr_name] = multiple_value + */ + __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 621, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF_SET(__pyx_v_attrs, __pyx_t_2); + __pyx_t_2 = 0; + + /* "gumbocy.pyx":622 + * if not has_attrs: + * attrs = {} + * has_attrs = True # <<<<<<<<<<<<<< + * attrs[attr_name] = multiple_value + * + */ + __pyx_v_has_attrs = 1; + + /* "gumbocy.pyx":620 + * return 0 + * + * if not has_attrs: # <<<<<<<<<<<<<< + * attrs = {} + * has_attrs = True + */ + } + + /* "gumbocy.pyx":623 + * attrs = {} + * has_attrs = True + * attrs[attr_name] = multiple_value # <<<<<<<<<<<<<< + * + * else: + */ + if (unlikely(PyObject_SetItem(__pyx_v_attrs, 
__pyx_v_attr_name, __pyx_v_multiple_value) < 0)) __PYX_ERR(0, 623, __pyx_L1_error) + + /* "gumbocy.pyx":614 + * if attr_name == b"class": + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * if len(multiple_value): # <<<<<<<<<<<<<< + * if self.has_classes_ignore: + * for v in multiple_value: */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); - __PYX_ERR(0, 128, __pyx_L1_error) - } - __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_head_only, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 128, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 128, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_v_self->head_only = __pyx_t_2; + } - /* "gumbocy.pyx":130 - * self.head_only = options.get("head_only") - * - * self.has_classes_ignore = options.get("classes_ignore") # <<<<<<<<<<<<<< - * if self.has_classes_ignore: - * self.classes_ignore = frozenset(options["classes_ignore"]) + /* "gumbocy.pyx":612 + * attr_name = str(attr.name) + * if re2_search(attr_name, deref(self.attributes_whitelist)): + * if attr_name == b"class": # <<<<<<<<<<<<<< + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + * if len(multiple_value): */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); - __PYX_ERR(0, 130, __pyx_L1_error) - } - __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_classes_ignore, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 130, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 130, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - 
__pyx_v_self->has_classes_ignore = __pyx_t_2; + goto __pyx_L13; + } - /* "gumbocy.pyx":131 - * - * self.has_classes_ignore = options.get("classes_ignore") - * if self.has_classes_ignore: # <<<<<<<<<<<<<< - * self.classes_ignore = frozenset(options["classes_ignore"]) + /* "gumbocy.pyx":627 + * else: * + * if not has_attrs: # <<<<<<<<<<<<<< + * attrs = {} + * has_attrs = True */ - __pyx_t_2 = (__pyx_v_self->has_classes_ignore != 0); - if (__pyx_t_2) { + /*else*/ { + __pyx_t_1 = ((!(__pyx_v_has_attrs != 0)) != 0); + if (__pyx_t_1) { - /* "gumbocy.pyx":132 - * self.has_classes_ignore = options.get("classes_ignore") - * if self.has_classes_ignore: - * self.classes_ignore = frozenset(options["classes_ignore"]) # <<<<<<<<<<<<<< + /* "gumbocy.pyx":628 * - * self.has_ids_ignore = options.get("ids_ignore") + * if not has_attrs: + * attrs = {} # <<<<<<<<<<<<<< + * has_attrs = True + * attrs[attr_name] = attr.value */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); - __PYX_ERR(0, 132, __pyx_L1_error) - } - __pyx_t_1 = __Pyx_PyDict_GetItem(__pyx_v_options, __pyx_n_s_classes_ignore); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyFrozenSet_New(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 132, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __Pyx_GIVEREF(__pyx_t_3); - __Pyx_GOTREF(__pyx_v_self->classes_ignore); - __Pyx_DECREF(__pyx_v_self->classes_ignore); - __pyx_v_self->classes_ignore = ((PyObject*)__pyx_t_3); - __pyx_t_3 = 0; + __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 628, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF_SET(__pyx_v_attrs, __pyx_t_2); + __pyx_t_2 = 0; - /* "gumbocy.pyx":131 - * - * self.has_classes_ignore = options.get("classes_ignore") - * if self.has_classes_ignore: # <<<<<<<<<<<<<< - * self.classes_ignore = frozenset(options["classes_ignore"]) + 
/* "gumbocy.pyx":629 + * if not has_attrs: + * attrs = {} + * has_attrs = True # <<<<<<<<<<<<<< + * attrs[attr_name] = attr.value * */ - } + __pyx_v_has_attrs = 1; - /* "gumbocy.pyx":134 - * self.classes_ignore = frozenset(options["classes_ignore"]) + /* "gumbocy.pyx":627 + * else: * - * self.has_ids_ignore = options.get("ids_ignore") # <<<<<<<<<<<<<< - * if self.has_ids_ignore: - * self.ids_ignore = frozenset(options["ids_ignore"]) + * if not has_attrs: # <<<<<<<<<<<<<< + * attrs = {} + * has_attrs = True */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); - __PYX_ERR(0, 134, __pyx_L1_error) - } - __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_ids_ignore, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 134, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 134, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_v_self->has_ids_ignore = __pyx_t_2; + } - /* "gumbocy.pyx":135 - * - * self.has_ids_ignore = options.get("ids_ignore") - * if self.has_ids_ignore: # <<<<<<<<<<<<<< - * self.ids_ignore = frozenset(options["ids_ignore"]) + /* "gumbocy.pyx":630 + * attrs = {} + * has_attrs = True + * attrs[attr_name] = attr.value # <<<<<<<<<<<<<< * + * if not has_attrs: */ - __pyx_t_2 = (__pyx_v_self->has_ids_ignore != 0); - if (__pyx_t_2) { + __pyx_t_2 = __Pyx_PyBytes_FromString(__pyx_v_attr->value); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 630, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (unlikely(PyObject_SetItem(__pyx_v_attrs, __pyx_v_attr_name, __pyx_t_2) < 0)) __PYX_ERR(0, 630, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + } + __pyx_L13:; - /* "gumbocy.pyx":136 - * self.has_ids_ignore = options.get("ids_ignore") - * if self.has_ids_ignore: - * self.ids_ignore = frozenset(options["ids_ignore"]) # <<<<<<<<<<<<<< - * - * 
self.has_attributes_whitelist = options.get("attributes_whitelist") + /* "gumbocy.pyx":611 + * attr = node.v.element.attributes.data[i] + * attr_name = str(attr.name) + * if re2_search(attr_name, deref(self.attributes_whitelist)): # <<<<<<<<<<<<<< + * if attr_name == b"class": + * multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); - __PYX_ERR(0, 136, __pyx_L1_error) - } - __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_options, __pyx_n_s_ids_ignore); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 136, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyFrozenSet_New(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 136, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __Pyx_GIVEREF(__pyx_t_1); - __Pyx_GOTREF(__pyx_v_self->ids_ignore); - __Pyx_DECREF(__pyx_v_self->ids_ignore); - __pyx_v_self->ids_ignore = ((PyObject*)__pyx_t_1); - __pyx_t_1 = 0; + } + } - /* "gumbocy.pyx":135 + /* "gumbocy.pyx":632 + * attrs[attr_name] = attr.value * - * self.has_ids_ignore = options.get("ids_ignore") - * if self.has_ids_ignore: # <<<<<<<<<<<<<< - * self.ids_ignore = frozenset(options["ids_ignore"]) + * if not has_attrs: # <<<<<<<<<<<<<< + * self.nodes.append((level, tag_name)) * */ - } + __pyx_t_1 = ((!(__pyx_v_has_attrs != 0)) != 0); + if (__pyx_t_1) { - /* "gumbocy.pyx":138 - * self.ids_ignore = frozenset(options["ids_ignore"]) + /* "gumbocy.pyx":633 * - * self.has_attributes_whitelist = options.get("attributes_whitelist") # <<<<<<<<<<<<<< - * if self.has_attributes_whitelist: - * self.attributes_whitelist = frozenset(options.get("attributes_whitelist") or []) + * if not has_attrs: + * self.nodes.append((level, tag_name)) # <<<<<<<<<<<<<< + * + * else: */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); 
- __PYX_ERR(0, 138, __pyx_L1_error) - } - __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_attributes_whitelist, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 138, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 138, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_v_self->has_attributes_whitelist = __pyx_t_2; + if (unlikely(__pyx_v_self->nodes == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); + __PYX_ERR(0, 633, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 633, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 633, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_12 = PyTuple_New(2); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 633, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_12, 1, __pyx_t_4); + __pyx_t_2 = 0; + __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_12); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 633, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; - /* "gumbocy.pyx":139 + /* "gumbocy.pyx":632 + * attrs[attr_name] = attr.value * - * self.has_attributes_whitelist = options.get("attributes_whitelist") - * if self.has_attributes_whitelist: # <<<<<<<<<<<<<< - * self.attributes_whitelist = frozenset(options.get("attributes_whitelist") or []) + * if not has_attrs: # <<<<<<<<<<<<<< + * self.nodes.append((level, tag_name)) * */ - __pyx_t_2 = (__pyx_v_self->has_attributes_whitelist != 0); - if (__pyx_t_2) { + goto __pyx_L21; + } - /* "gumbocy.pyx":140 - * self.has_attributes_whitelist = options.get("attributes_whitelist") - * if 
self.has_attributes_whitelist: - * self.attributes_whitelist = frozenset(options.get("attributes_whitelist") or []) # <<<<<<<<<<<<<< + /* "gumbocy.pyx":637 + * else: * - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) + * if self.has_ids_ignore: # <<<<<<<<<<<<<< + * if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)): + * return 0 */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); - __PYX_ERR(0, 140, __pyx_L1_error) - } - __pyx_t_3 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_attributes_whitelist, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 140, __pyx_L1_error) - if (!__pyx_t_2) { - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - } else { - __Pyx_INCREF(__pyx_t_3); - __pyx_t_1 = __pyx_t_3; - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - goto __pyx_L8_bool_binop_done; - } - __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_INCREF(__pyx_t_3); - __pyx_t_1 = __pyx_t_3; - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_L8_bool_binop_done:; - __pyx_t_3 = __Pyx_PyFrozenSet_New(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __Pyx_GIVEREF(__pyx_t_3); - __Pyx_GOTREF(__pyx_v_self->attributes_whitelist); - __Pyx_DECREF(__pyx_v_self->attributes_whitelist); - __pyx_v_self->attributes_whitelist = ((PyObject*)__pyx_t_3); - __pyx_t_3 = 0; + /*else*/ { + __pyx_t_1 = (__pyx_v_self->has_ids_ignore != 0); + if (__pyx_t_1) { - /* "gumbocy.pyx":139 + /* "gumbocy.pyx":638 * - * self.has_attributes_whitelist = options.get("attributes_whitelist") - * if self.has_attributes_whitelist: # <<<<<<<<<<<<<< - * self.attributes_whitelist = 
frozenset(options.get("attributes_whitelist") or []) + * if self.has_ids_ignore: + * if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)): # <<<<<<<<<<<<<< + * return 0 * */ - } + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_v_attrs, __pyx_n_s_get); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 638, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 638, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 638, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + } else { + __pyx_t_1 = __pyx_t_6; + goto __pyx_L24_bool_binop_done; + } + __pyx_t_12 = PyObject_GetItem(__pyx_v_attrs, __pyx_n_s_id); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 638, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_12, __pyx_n_s_lower); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 638, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + __pyx_t_12 = NULL; + if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_12 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_12)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_12); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (__pyx_t_12) { + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_12); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 638, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } else { + __pyx_t_4 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 638, __pyx_L1_error) + } + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_t_4); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 638, 
__pyx_L1_error) + __pyx_t_6 = (__pyx_f_7gumbocy_re2_search(__pyx_t_11, (*__pyx_v_self->ids_ignore)) != 0); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_1 = __pyx_t_6; + __pyx_L24_bool_binop_done:; + if (__pyx_t_1) { - /* "gumbocy.pyx":142 - * self.attributes_whitelist = frozenset(options.get("attributes_whitelist") or []) + /* "gumbocy.pyx":639 + * if self.has_ids_ignore: + * if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)): + * return 0 # <<<<<<<<<<<<<< * - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) # <<<<<<<<<<<<<< - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) + * self.nodes.append((level, tag_name, attrs)) */ - __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_BODY); + __pyx_r = 0; + goto __pyx_L0; - /* "gumbocy.pyx":143 + /* "gumbocy.pyx":638 * - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) # <<<<<<<<<<<<<< - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) - */ - __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_P); - - /* "gumbocy.pyx":144 - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) # <<<<<<<<<<<<<< - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) + * if self.has_ids_ignore: + * if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)): # <<<<<<<<<<<<<< + * return 0 * */ - __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_DIV); + } - /* "gumbocy.pyx":145 - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) # <<<<<<<<<<<<<< + /* "gumbocy.pyx":637 + * else: * - * for tag_name in 
options.get("tags_ignore", []): + * if self.has_ids_ignore: # <<<<<<<<<<<<<< + * if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)): + * return 0 */ - __pyx_v_self->tags_ignore_head_only.insert(GUMBO_TAG_SPAN); + } - /* "gumbocy.pyx":147 - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) + /* "gumbocy.pyx":641 + * return 0 * - * for tag_name in options.get("tags_ignore", []): # <<<<<<<<<<<<<< - * tag = gumbocy.gumbo_tag_enum(tag_name) - * if tag != gumbocy.GUMBO_TAG_UNKNOWN: + * self.nodes.append((level, tag_name, attrs)) # <<<<<<<<<<<<<< + * + * else: */ - if (unlikely(__pyx_v_options == Py_None)) { - PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "get"); - __PYX_ERR(0, 147, __pyx_L1_error) - } - __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 147, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyDict_GetItemDefault(__pyx_v_options, __pyx_n_s_tags_ignore, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { - __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_5 = 0; - __pyx_t_6 = NULL; - } else { - __pyx_t_5 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 147, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_6 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 147, __pyx_L1_error) - } - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - for (;;) { - if (likely(!__pyx_t_6)) { - if (likely(PyList_CheckExact(__pyx_t_3))) { - if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_3)) break; - #if CYTHON_COMPILING_IN_CPYTHON - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 147, __pyx_L1_error) - #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - #endif - } else { - if (__pyx_t_5 >= PyTuple_GET_SIZE(__pyx_t_3)) break; - #if CYTHON_COMPILING_IN_CPYTHON - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 147, __pyx_L1_error) - #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - #endif + if (unlikely(__pyx_v_self->nodes == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); + __PYX_ERR(0, 641, __pyx_L1_error) + } + __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 641, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_2 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 641, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_12 = PyTuple_New(3); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 641, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_4); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_12, 1, __pyx_t_2); + __Pyx_INCREF(__pyx_v_attrs); + __Pyx_GIVEREF(__pyx_v_attrs); + PyTuple_SET_ITEM(__pyx_t_12, 2, __pyx_v_attrs); + __pyx_t_4 = 0; + __pyx_t_2 = 0; + __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_12); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 641, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; } - } else { - __pyx_t_1 = __pyx_t_6(__pyx_t_3); - if (unlikely(!__pyx_t_1)) { - PyObject* exc_type = PyErr_Occurred(); - if (exc_type) { - if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 147, __pyx_L1_error) - } - break; + __pyx_L21:; + + /* "gumbocy.pyx":603 + * tag_name = py_tag_name + * + * if self.has_attributes_whitelist: # 
<<<<<<<<<<<<<< + * + * # Build a dict with all the whitelisted attributes + */ + goto __pyx_L9; + } + + /* "gumbocy.pyx":644 + * + * else: + * self.nodes.append((level, tag_name)) # <<<<<<<<<<<<<< + * + * # Call _iternode() recursively for each of the children + */ + /*else*/ { + if (unlikely(__pyx_v_self->nodes == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "append"); + __PYX_ERR(0, 644, __pyx_L1_error) } - __Pyx_GOTREF(__pyx_t_1); + __pyx_t_12 = __Pyx_PyInt_From_int(__pyx_v_level); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 644, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __pyx_t_2 = __Pyx_PyBytes_FromString(__pyx_v_tag_name); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 644, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 644, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_12); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_12); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_2); + __pyx_t_12 = 0; + __pyx_t_2 = 0; + __pyx_t_5 = __Pyx_PyList_Append(__pyx_v_self->nodes, __pyx_t_4); if (unlikely(__pyx_t_5 == -1)) __PYX_ERR(0, 644, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } - __Pyx_XDECREF_SET(__pyx_v_tag_name, __pyx_t_1); - __pyx_t_1 = 0; + __pyx_L9:; - /* "gumbocy.pyx":148 + /* "gumbocy.pyx":647 * - * for tag_name in options.get("tags_ignore", []): - * tag = gumbocy.gumbo_tag_enum(tag_name) # <<<<<<<<<<<<<< - * if tag != gumbocy.GUMBO_TAG_UNKNOWN: - * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) + * # Call _iternode() recursively for each of the children + * for i in range(node.v.element.children.length): # <<<<<<<<<<<<<< + * child = node.v.element.children.data[i] + * if self._traverse_node_simple(level + 1, child) == 1: + */ + __pyx_t_9 = __pyx_v_node->v.element.children.length; + for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) { + __pyx_v_i = __pyx_t_10; + + /* "gumbocy.pyx":648 + 
* # Call _iternode() recursively for each of the children + * for i in range(node.v.element.children.length): + * child = node.v.element.children.data[i] # <<<<<<<<<<<<<< + * if self._traverse_node_simple(level + 1, child) == 1: + * break */ - __pyx_t_7 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_7) && PyErr_Occurred())) __PYX_ERR(0, 148, __pyx_L1_error) - __pyx_v_tag = gumbo_tag_enum(__pyx_t_7); + __pyx_v_child = ((GumboNode *)(__pyx_v_node->v.element.children.data[__pyx_v_i])); - /* "gumbocy.pyx":149 - * for tag_name in options.get("tags_ignore", []): - * tag = gumbocy.gumbo_tag_enum(tag_name) - * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< - * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) + /* "gumbocy.pyx":649 + * for i in range(node.v.element.children.length): + * child = node.v.element.children.data[i] + * if self._traverse_node_simple(level + 1, child) == 1: # <<<<<<<<<<<<<< + * break * */ - __pyx_t_2 = ((__pyx_v_tag != GUMBO_TAG_UNKNOWN) != 0); - if (__pyx_t_2) { + __pyx_t_1 = ((((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->_traverse_node_simple(__pyx_v_self, (__pyx_v_level + 1), __pyx_v_child) == 1) != 0); + if (__pyx_t_1) { - /* "gumbocy.pyx":150 - * tag = gumbocy.gumbo_tag_enum(tag_name) - * if tag != gumbocy.GUMBO_TAG_UNKNOWN: - * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) # <<<<<<<<<<<<<< + /* "gumbocy.pyx":650 + * child = node.v.element.children.data[i] + * if self._traverse_node_simple(level + 1, child) == 1: + * break # <<<<<<<<<<<<<< * - * self.nodes = [] + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: */ - __pyx_t_7 = __Pyx_PyObject_AsString(__pyx_v_tag_name); if (unlikely((!__pyx_t_7) && PyErr_Occurred())) __PYX_ERR(0, 150, __pyx_L1_error) - __pyx_v_self->tags_ignore.insert(((int)gumbo_tag_enum(__pyx_t_7))); + goto __pyx_L27_break; - /* "gumbocy.pyx":149 - * for tag_name in options.get("tags_ignore", []): - * tag = 
gumbocy.gumbo_tag_enum(tag_name) - * if tag != gumbocy.GUMBO_TAG_UNKNOWN: # <<<<<<<<<<<<<< - * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) + /* "gumbocy.pyx":649 + * for i in range(node.v.element.children.length): + * child = node.v.element.children.data[i] + * if self._traverse_node_simple(level + 1, child) == 1: # <<<<<<<<<<<<<< + * break * */ + } } + __pyx_L27_break:; - /* "gumbocy.pyx":147 - * self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) + /* "gumbocy.pyx":652 + * break + * + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: # <<<<<<<<<<<<<< + * return 1 * - * for tag_name in options.get("tags_ignore", []): # <<<<<<<<<<<<<< - * tag = gumbocy.gumbo_tag_enum(tag_name) - * if tag != gumbocy.GUMBO_TAG_UNKNOWN: */ - } - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = ((__pyx_v_node->v.element.tag == GUMBO_TAG_HEAD) != 0); + if (__pyx_t_6) { + } else { + __pyx_t_1 = __pyx_t_6; + goto __pyx_L30_bool_binop_done; + } + __pyx_t_6 = (__pyx_v_self->head_only != 0); + __pyx_t_1 = __pyx_t_6; + __pyx_L30_bool_binop_done:; + if (__pyx_t_1) { - /* "gumbocy.pyx":152 - * self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) + /* "gumbocy.pyx":653 * - * self.nodes = [] # <<<<<<<<<<<<<< + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: + * return 1 # <<<<<<<<<<<<<< * - * self._traverse_node(0, self.output.root) + * return 0 */ - __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 152, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_GIVEREF(__pyx_t_3); - __Pyx_GOTREF(__pyx_v_self->nodes); - __Pyx_DECREF(__pyx_v_self->nodes); - __pyx_v_self->nodes = ((PyObject*)__pyx_t_3); - __pyx_t_3 = 0; + __pyx_r = 1; + goto __pyx_L0; - /* "gumbocy.pyx":154 - * self.nodes = [] + /* "gumbocy.pyx":652 + * break * - * self._traverse_node(0, self.output.root) # <<<<<<<<<<<<<< + * if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: # <<<<<<<<<<<<<< + * return 1 * - * return self.nodes */ - 
((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->_traverse_node(__pyx_v_self, 0, __pyx_v_self->output->root); + } - /* "gumbocy.pyx":156 - * self._traverse_node(0, self.output.root) + /* "gumbocy.pyx":584 + * self.nodes.append((level, None, node.v.text.text)) * - * return self.nodes # <<<<<<<<<<<<<< + * elif node.type == gumbocy.GUMBO_NODE_ELEMENT: # <<<<<<<<<<<<<< + * + * tag_n = node.v.element.tag + */ + break; + default: break; + } + + /* "gumbocy.pyx":655 + * return 1 + * + * return 0 # <<<<<<<<<<<<<< * * def __dealloc__(self): */ - __Pyx_XDECREF(__pyx_r); - __Pyx_INCREF(__pyx_v_self->nodes); - __pyx_r = __pyx_v_self->nodes; + __pyx_r = 0; goto __pyx_L0; - /* "gumbocy.pyx":123 - * self.output = gumbocy.gumbo_parse(self.html) + /* "gumbocy.pyx":573 + * return self.nodes * - * def listnodes(self, dict options=None): # <<<<<<<<<<<<<< - * """ Return the nodes as a flat list of tuples """ + * cdef bint _traverse_node_simple(self, int level, gumbocy.GumboNode* node): # <<<<<<<<<<<<<< + * """ Traverses the node tree. 
Return 1 to stop at this level """ * */ /* function exit code */ __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); - __Pyx_AddTraceback("gumbocy.HTMLParser.listnodes", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_r = NULL; + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_12); + __Pyx_XDECREF(__pyx_t_13); + __Pyx_XDECREF(__pyx_t_14); + __Pyx_WriteUnraisable("gumbocy.HTMLParser._traverse_node_simple", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_r = 0; __pyx_L0:; - __Pyx_XDECREF(__pyx_v_tag_name); - __Pyx_XDECREF(__pyx_v_options); - __Pyx_XGIVEREF(__pyx_r); + __Pyx_XDECREF(__pyx_v_py_tag_name); + __Pyx_XDECREF(__pyx_v_attrs); + __Pyx_XDECREF(__pyx_v_attr_name); + __Pyx_XDECREF(__pyx_v_multiple_value); + __Pyx_XDECREF(__pyx_v_v); __Pyx_RefNannyFinishContext(); return __pyx_r; } -/* "gumbocy.pyx":158 - * return self.nodes +/* "gumbocy.pyx":657 + * return 0 * * def __dealloc__(self): # <<<<<<<<<<<<<< * """ Cleanup gumbo memory when the parser is deallocated by Python """ - * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + * self.free() */ /* Python wrapper */ -static void __pyx_pw_7gumbocy_10HTMLParser_7__dealloc__(PyObject *__pyx_v_self); /*proto*/ -static void __pyx_pw_7gumbocy_10HTMLParser_7__dealloc__(PyObject *__pyx_v_self) { +static void __pyx_pw_7gumbocy_10HTMLParser_9__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_7gumbocy_10HTMLParser_9__dealloc__(PyObject *__pyx_v_self) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); - __pyx_pf_7gumbocy_10HTMLParser_6__dealloc__(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self)); + __pyx_pf_7gumbocy_10HTMLParser_8__dealloc__(((struct __pyx_obj_7gumbocy_HTMLParser *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); } -static void __pyx_pf_7gumbocy_10HTMLParser_6__dealloc__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { +static void 
__pyx_pf_7gumbocy_10HTMLParser_8__dealloc__(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "gumbocy.pyx":160 + /* "gumbocy.pyx":659 * def __dealloc__(self): * """ Cleanup gumbo memory when the parser is deallocated by Python """ - * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) # <<<<<<<<<<<<<< + * self.free() # <<<<<<<<<<<<<< + * + * cdef free(self): */ - gumbo_destroy_output((&kGumboDefaultOptions), __pyx_v_self->output); + __pyx_t_1 = ((struct __pyx_vtabstruct_7gumbocy_HTMLParser *)__pyx_v_self->__pyx_vtab)->free(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 659, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gumbocy.pyx":158 - * return self.nodes + /* "gumbocy.pyx":657 + * return 0 * * def __dealloc__(self): # <<<<<<<<<<<<<< * """ Cleanup gumbo memory when the parser is deallocated by Python """ - * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + * self.free() + */ + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_WriteUnraisable("gumbocy.HTMLParser.__dealloc__", __pyx_clineno, __pyx_lineno, __pyx_filename, 0, 0); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); +} + +/* "gumbocy.pyx":661 + * self.free() + * + * cdef free(self): # <<<<<<<<<<<<<< + * if self.has_output: + * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + */ + +static PyObject *__pyx_f_7gumbocy_10HTMLParser_free(struct __pyx_obj_7gumbocy_HTMLParser *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("free", 0); + + /* "gumbocy.pyx":662 + * + * cdef free(self): + * if self.has_output: # <<<<<<<<<<<<<< + * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + * self.has_output = 0 + */ + __pyx_t_1 = 
(__pyx_v_self->has_output != 0); + if (__pyx_t_1) { + + /* "gumbocy.pyx":663 + * cdef free(self): + * if self.has_output: + * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) # <<<<<<<<<<<<<< + * self.has_output = 0 + */ + gumbo_destroy_output((&kGumboDefaultOptions), __pyx_v_self->output); + + /* "gumbocy.pyx":664 + * if self.has_output: + * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + * self.has_output = 0 # <<<<<<<<<<<<<< + */ + __pyx_v_self->has_output = 0; + + /* "gumbocy.pyx":662 + * + * cdef free(self): + * if self.has_output: # <<<<<<<<<<<<<< + * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + * self.has_output = 0 + */ + } + + /* "gumbocy.pyx":661 + * self.free() + * + * cdef free(self): # <<<<<<<<<<<<<< + * if self.has_output: + * gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) */ /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_tp_new_7gumbocy_Attributes(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) { + struct __pyx_obj_7gumbocy_Attributes *p; + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + p = ((struct __pyx_obj_7gumbocy_Attributes *)o); + p->values = ((PyObject*)Py_None); Py_INCREF(Py_None); + p->classes = ((PyObject*)Py_None); Py_INCREF(Py_None); + return o; +} + +static void __pyx_tp_dealloc_7gumbocy_Attributes(PyObject *o) { + struct __pyx_obj_7gumbocy_Attributes *p = (struct __pyx_obj_7gumbocy_Attributes *)o; + #if PY_VERSION_HEX >= 0x030400a1 + if (unlikely(Py_TYPE(o)->tp_finalize) && !_PyGC_FINALIZED(o)) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + PyObject_GC_UnTrack(o); + Py_CLEAR(p->values); + 
Py_CLEAR(p->classes); + (*Py_TYPE(o)->tp_free)(o); +} + +static int __pyx_tp_traverse_7gumbocy_Attributes(PyObject *o, visitproc v, void *a) { + int e; + struct __pyx_obj_7gumbocy_Attributes *p = (struct __pyx_obj_7gumbocy_Attributes *)o; + if (p->values) { + e = (*v)(p->values, a); if (e) return e; + } + if (p->classes) { + e = (*v)(p->classes, a); if (e) return e; + } + return 0; +} + +static int __pyx_tp_clear_7gumbocy_Attributes(PyObject *o) { + PyObject* tmp; + struct __pyx_obj_7gumbocy_Attributes *p = (struct __pyx_obj_7gumbocy_Attributes *)o; + tmp = ((PyObject*)p->values); + p->values = ((PyObject*)Py_None); Py_INCREF(Py_None); + Py_XDECREF(tmp); + tmp = ((PyObject*)p->classes); + p->classes = ((PyObject*)Py_None); Py_INCREF(Py_None); + Py_XDECREF(tmp); + return 0; } + +static PyTypeObject __pyx_type_7gumbocy_Attributes = { + PyVarObject_HEAD_INIT(0, 0) + "gumbocy.Attributes", /*tp_name*/ + sizeof(struct __pyx_obj_7gumbocy_Attributes), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_7gumbocy_Attributes, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /*tp_flags*/ + 0, /*tp_doc*/ + __pyx_tp_traverse_7gumbocy_Attributes, /*tp_traverse*/ + __pyx_tp_clear_7gumbocy_Attributes, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + 0, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + 
__pyx_tp_new_7gumbocy_Attributes, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif +}; static struct __pyx_vtabstruct_7gumbocy_HTMLParser __pyx_vtable_7gumbocy_HTMLParser; static PyObject *__pyx_tp_new_7gumbocy_HTMLParser(PyTypeObject *t, PyObject *a, PyObject *k) { @@ -2673,10 +8844,14 @@ static PyObject *__pyx_tp_new_7gumbocy_HTMLParser(PyTypeObject *t, PyObject *a, p->__pyx_vtab = __pyx_vtabptr_7gumbocy_HTMLParser; new((void*)&(p->tags_ignore)) std::unordered_set (); new((void*)&(p->tags_ignore_head_only)) std::unordered_set (); + new((void*)&(p->tags_boilerplate)) std::unordered_set (); + new((void*)&(p->tags_boilerplate_bypass)) std::unordered_set (); + new((void*)&(p->tags_separators)) std::unordered_set (); + p->current_stack = ((PyObject*)Py_None); Py_INCREF(Py_None); + p->analysis = ((PyObject*)Py_None); Py_INCREF(Py_None); + p->current_word_group = Py_None; Py_INCREF(Py_None); + p->current_hyperlink = Py_None; Py_INCREF(Py_None); p->nodes = ((PyObject*)Py_None); Py_INCREF(Py_None); - p->classes_ignore = ((PyObject*)Py_None); Py_INCREF(Py_None); - p->attributes_whitelist = ((PyObject*)Py_None); Py_INCREF(Py_None); - p->ids_ignore = ((PyObject*)Py_None); Py_INCREF(Py_None); if (unlikely(__pyx_pw_7gumbocy_10HTMLParser_1__cinit__(o, a, k) < 0)) { Py_DECREF(o); o = 0; } @@ -2695,33 +8870,40 @@ static void __pyx_tp_dealloc_7gumbocy_HTMLParser(PyObject *o) { PyObject *etype, *eval, *etb; PyErr_Fetch(&etype, &eval, &etb); ++Py_REFCNT(o); - __pyx_pw_7gumbocy_10HTMLParser_7__dealloc__(o); + __pyx_pw_7gumbocy_10HTMLParser_9__dealloc__(o); --Py_REFCNT(o); PyErr_Restore(etype, eval, etb); } __Pyx_call_destructor(p->tags_ignore); __Pyx_call_destructor(p->tags_ignore_head_only); + __Pyx_call_destructor(p->tags_boilerplate); + __Pyx_call_destructor(p->tags_boilerplate_bypass); 
+ __Pyx_call_destructor(p->tags_separators); + Py_CLEAR(p->current_stack); + Py_CLEAR(p->analysis); + Py_CLEAR(p->current_word_group); + Py_CLEAR(p->current_hyperlink); Py_CLEAR(p->nodes); - Py_CLEAR(p->classes_ignore); - Py_CLEAR(p->attributes_whitelist); - Py_CLEAR(p->ids_ignore); (*Py_TYPE(o)->tp_free)(o); } static int __pyx_tp_traverse_7gumbocy_HTMLParser(PyObject *o, visitproc v, void *a) { int e; struct __pyx_obj_7gumbocy_HTMLParser *p = (struct __pyx_obj_7gumbocy_HTMLParser *)o; - if (p->nodes) { - e = (*v)(p->nodes, a); if (e) return e; + if (p->current_stack) { + e = (*v)(p->current_stack, a); if (e) return e; + } + if (p->analysis) { + e = (*v)(p->analysis, a); if (e) return e; } - if (p->classes_ignore) { - e = (*v)(p->classes_ignore, a); if (e) return e; + if (p->current_word_group) { + e = (*v)(p->current_word_group, a); if (e) return e; } - if (p->attributes_whitelist) { - e = (*v)(p->attributes_whitelist, a); if (e) return e; + if (p->current_hyperlink) { + e = (*v)(p->current_hyperlink, a); if (e) return e; } - if (p->ids_ignore) { - e = (*v)(p->ids_ignore, a); if (e) return e; + if (p->nodes) { + e = (*v)(p->nodes, a); if (e) return e; } return 0; } @@ -2729,24 +8911,28 @@ static int __pyx_tp_traverse_7gumbocy_HTMLParser(PyObject *o, visitproc v, void static int __pyx_tp_clear_7gumbocy_HTMLParser(PyObject *o) { PyObject* tmp; struct __pyx_obj_7gumbocy_HTMLParser *p = (struct __pyx_obj_7gumbocy_HTMLParser *)o; - tmp = ((PyObject*)p->nodes); - p->nodes = ((PyObject*)Py_None); Py_INCREF(Py_None); + tmp = ((PyObject*)p->current_stack); + p->current_stack = ((PyObject*)Py_None); Py_INCREF(Py_None); Py_XDECREF(tmp); - tmp = ((PyObject*)p->classes_ignore); - p->classes_ignore = ((PyObject*)Py_None); Py_INCREF(Py_None); + tmp = ((PyObject*)p->analysis); + p->analysis = ((PyObject*)Py_None); Py_INCREF(Py_None); Py_XDECREF(tmp); - tmp = ((PyObject*)p->attributes_whitelist); - p->attributes_whitelist = ((PyObject*)Py_None); Py_INCREF(Py_None); + tmp = 
((PyObject*)p->current_word_group); + p->current_word_group = Py_None; Py_INCREF(Py_None); Py_XDECREF(tmp); - tmp = ((PyObject*)p->ids_ignore); - p->ids_ignore = ((PyObject*)Py_None); Py_INCREF(Py_None); + tmp = ((PyObject*)p->current_hyperlink); + p->current_hyperlink = Py_None; Py_INCREF(Py_None); + Py_XDECREF(tmp); + tmp = ((PyObject*)p->nodes); + p->nodes = ((PyObject*)Py_None); Py_INCREF(Py_None); Py_XDECREF(tmp); return 0; } static PyMethodDef __pyx_methods_7gumbocy_HTMLParser[] = { - {"parse", (PyCFunction)__pyx_pw_7gumbocy_10HTMLParser_3parse, METH_NOARGS, __pyx_doc_7gumbocy_10HTMLParser_2parse}, - {"listnodes", (PyCFunction)__pyx_pw_7gumbocy_10HTMLParser_5listnodes, METH_VARARGS|METH_KEYWORDS, __pyx_doc_7gumbocy_10HTMLParser_4listnodes}, + {"parse", (PyCFunction)__pyx_pw_7gumbocy_10HTMLParser_3parse, METH_O, __pyx_doc_7gumbocy_10HTMLParser_2parse}, + {"analyze", (PyCFunction)__pyx_pw_7gumbocy_10HTMLParser_5analyze, METH_NOARGS, __pyx_doc_7gumbocy_10HTMLParser_4analyze}, + {"listnodes", (PyCFunction)__pyx_pw_7gumbocy_10HTMLParser_7listnodes, METH_NOARGS, __pyx_doc_7gumbocy_10HTMLParser_6listnodes}, {0, 0, 0, 0} }; @@ -2831,34 +9017,86 @@ static struct PyModuleDef __pyx_moduledef = { #endif static __Pyx_StringTabEntry __pyx_string_tab[] = { - {&__pyx_kp_b_, __pyx_k_, sizeof(__pyx_k_), 0, 0, 0, 0}, + {&__pyx_kp_s_, __pyx_k_, sizeof(__pyx_k_), 0, 0, 1, 0}, {&__pyx_n_s_RE_SPLIT_WHITESPACE, __pyx_k_RE_SPLIT_WHITESPACE, sizeof(__pyx_k_RE_SPLIT_WHITESPACE), 0, 0, 1, 1}, + {&__pyx_kp_s__2, __pyx_k__2, sizeof(__pyx_k__2), 0, 0, 1, 0}, + {&__pyx_kp_s__3, __pyx_k__3, sizeof(__pyx_k__3), 0, 0, 1, 0}, + {&__pyx_kp_s__4, __pyx_k__4, sizeof(__pyx_k__4), 0, 0, 1, 0}, + {&__pyx_kp_b__5, __pyx_k__5, sizeof(__pyx_k__5), 0, 0, 0, 0}, + {&__pyx_kp_s__5, __pyx_k__5, sizeof(__pyx_k__5), 0, 0, 1, 0}, + {&__pyx_kp_s__8, __pyx_k__8, sizeof(__pyx_k__8), 0, 0, 1, 0}, + {&__pyx_n_b_alt, __pyx_k_alt, sizeof(__pyx_k_alt), 0, 0, 0, 1}, + {&__pyx_n_s_analyze_external_hyperlinks, 
__pyx_k_analyze_external_hyperlinks, sizeof(__pyx_k_analyze_external_hyperlinks), 0, 0, 1, 1}, + {&__pyx_n_s_analyze_internal_hyperlinks, __pyx_k_analyze_internal_hyperlinks, sizeof(__pyx_k_analyze_internal_hyperlinks), 0, 0, 1, 1}, + {&__pyx_n_s_analyze_word_groups, __pyx_k_analyze_word_groups, sizeof(__pyx_k_analyze_word_groups), 0, 0, 1, 1}, + {&__pyx_n_s_append, __pyx_k_append, sizeof(__pyx_k_append), 0, 0, 1, 1}, + {&__pyx_kp_b_aria_hidden, __pyx_k_aria_hidden, sizeof(__pyx_k_aria_hidden), 0, 0, 0, 0}, + {&__pyx_n_s_article, __pyx_k_article, sizeof(__pyx_k_article), 0, 0, 1, 1}, {&__pyx_n_s_attributes_whitelist, __pyx_k_attributes_whitelist, sizeof(__pyx_k_attributes_whitelist), 0, 0, 1, 1}, + {&__pyx_n_s_base_url, __pyx_k_base_url, sizeof(__pyx_k_base_url), 0, 0, 1, 1}, {&__pyx_n_b_class, __pyx_k_class, sizeof(__pyx_k_class), 0, 0, 0, 1}, + {&__pyx_n_s_class, __pyx_k_class, sizeof(__pyx_k_class), 0, 0, 1, 1}, + {&__pyx_n_s_classes_boilerplate, __pyx_k_classes_boilerplate, sizeof(__pyx_k_classes_boilerplate), 0, 0, 1, 1}, + {&__pyx_n_s_classes_hidden, __pyx_k_classes_hidden, sizeof(__pyx_k_classes_hidden), 0, 0, 1, 1}, {&__pyx_n_s_classes_ignore, __pyx_k_classes_ignore, sizeof(__pyx_k_classes_ignore), 0, 0, 1, 1}, {&__pyx_n_s_compile, __pyx_k_compile, sizeof(__pyx_k_compile), 0, 0, 1, 1}, + {&__pyx_n_b_content, __pyx_k_content, sizeof(__pyx_k_content), 0, 0, 0, 1}, + {&__pyx_n_s_content, __pyx_k_content, sizeof(__pyx_k_content), 0, 0, 1, 1}, + {&__pyx_n_s_external_hyperlinks, __pyx_k_external_hyperlinks, sizeof(__pyx_k_external_hyperlinks), 0, 0, 1, 1}, {&__pyx_n_s_get, __pyx_k_get, sizeof(__pyx_k_get), 0, 0, 1, 1}, + {&__pyx_n_s_head_links, __pyx_k_head_links, sizeof(__pyx_k_head_links), 0, 0, 1, 1}, + {&__pyx_n_s_head_metas, __pyx_k_head_metas, sizeof(__pyx_k_head_metas), 0, 0, 1, 1}, {&__pyx_n_s_head_only, __pyx_k_head_only, sizeof(__pyx_k_head_only), 0, 0, 1, 1}, - {&__pyx_n_s_html, __pyx_k_html, sizeof(__pyx_k_html), 0, 0, 1, 1}, + {&__pyx_n_b_hidden, 
__pyx_k_hidden, sizeof(__pyx_k_hidden), 0, 0, 0, 1}, + {&__pyx_n_b_href, __pyx_k_href, sizeof(__pyx_k_href), 0, 0, 0, 1}, + {&__pyx_n_s_href, __pyx_k_href, sizeof(__pyx_k_href), 0, 0, 1, 1}, + {&__pyx_kp_s_http, __pyx_k_http, sizeof(__pyx_k_http), 0, 0, 1, 0}, + {&__pyx_kp_s_https, __pyx_k_https, sizeof(__pyx_k_https), 0, 0, 1, 0}, + {&__pyx_n_b_id, __pyx_k_id, sizeof(__pyx_k_id), 0, 0, 0, 1}, {&__pyx_n_s_id, __pyx_k_id, sizeof(__pyx_k_id), 0, 0, 1, 1}, + {&__pyx_n_s_ids_boilerplate, __pyx_k_ids_boilerplate, sizeof(__pyx_k_ids_boilerplate), 0, 0, 1, 1}, + {&__pyx_n_s_ids_hidden, __pyx_k_ids_hidden, sizeof(__pyx_k_ids_hidden), 0, 0, 1, 1}, {&__pyx_n_s_ids_ignore, __pyx_k_ids_ignore, sizeof(__pyx_k_ids_ignore), 0, 0, 1, 1}, {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_n_s_internal_hyperlinks, __pyx_k_internal_hyperlinks, sizeof(__pyx_k_internal_hyperlinks), 0, 0, 1, 1}, + {&__pyx_n_s_join, __pyx_k_join, sizeof(__pyx_k_join), 0, 0, 1, 1}, {&__pyx_n_s_lower, __pyx_k_lower, sizeof(__pyx_k_lower), 0, 0, 1, 1}, {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_metas_whitelist, __pyx_k_metas_whitelist, sizeof(__pyx_k_metas_whitelist), 0, 0, 1, 1}, + {&__pyx_n_b_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 0, 1}, + {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1}, {&__pyx_n_s_nesting_limit, __pyx_k_nesting_limit, sizeof(__pyx_k_nesting_limit), 0, 0, 1, 1}, {&__pyx_n_s_options, __pyx_k_options, sizeof(__pyx_k_options), 0, 0, 1, 1}, + {&__pyx_n_s_pop, __pyx_k_pop, sizeof(__pyx_k_pop), 0, 0, 1, 1}, + {&__pyx_n_b_property, __pyx_k_property, sizeof(__pyx_k_property), 0, 0, 0, 1}, + {&__pyx_n_s_property, __pyx_k_property, sizeof(__pyx_k_property), 0, 0, 1, 1}, {&__pyx_n_s_pyx_vtable, __pyx_k_pyx_vtable, sizeof(__pyx_k_pyx_vtable), 0, 0, 1, 1}, {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, {&__pyx_n_s_re, __pyx_k_re, sizeof(__pyx_k_re), 0, 0, 1, 1}, + {&__pyx_n_b_rel, 
__pyx_k_rel, sizeof(__pyx_k_rel), 0, 0, 0, 1}, + {&__pyx_n_s_rel, __pyx_k_rel, sizeof(__pyx_k_rel), 0, 0, 1, 1}, + {&__pyx_n_b_role, __pyx_k_role, sizeof(__pyx_k_role), 0, 0, 0, 1}, + {&__pyx_n_s_role, __pyx_k_role, sizeof(__pyx_k_role), 0, 0, 1, 1}, + {&__pyx_n_s_roles_boilerplate, __pyx_k_roles_boilerplate, sizeof(__pyx_k_roles_boilerplate), 0, 0, 1, 1}, {&__pyx_kp_s_s, __pyx_k_s, sizeof(__pyx_k_s), 0, 0, 1, 0}, + {&__pyx_n_s_setdefault, __pyx_k_setdefault, sizeof(__pyx_k_setdefault), 0, 0, 1, 1}, {&__pyx_n_s_split, __pyx_k_split, sizeof(__pyx_k_split), 0, 0, 1, 1}, + {&__pyx_n_b_src, __pyx_k_src, sizeof(__pyx_k_src), 0, 0, 0, 1}, + {&__pyx_n_s_startswith, __pyx_k_startswith, sizeof(__pyx_k_startswith), 0, 0, 1, 1}, {&__pyx_n_s_strip, __pyx_k_strip, sizeof(__pyx_k_strip), 0, 0, 1, 1}, + {&__pyx_n_b_style, __pyx_k_style, sizeof(__pyx_k_style), 0, 0, 0, 1}, + {&__pyx_n_s_tags_boilerplate, __pyx_k_tags_boilerplate, sizeof(__pyx_k_tags_boilerplate), 0, 0, 1, 1}, + {&__pyx_n_s_tags_boilerplate_bypass, __pyx_k_tags_boilerplate_bypass, sizeof(__pyx_k_tags_boilerplate_bypass), 0, 0, 1, 1}, {&__pyx_n_s_tags_ignore, __pyx_k_tags_ignore, sizeof(__pyx_k_tags_ignore), 0, 0, 1, 1}, + {&__pyx_n_s_tags_separators, __pyx_k_tags_separators, sizeof(__pyx_k_tags_separators), 0, 0, 1, 1}, {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_title, __pyx_k_title, sizeof(__pyx_k_title), 0, 0, 1, 1}, + {&__pyx_n_b_true, __pyx_k_true, sizeof(__pyx_k_true), 0, 0, 0, 1}, + {&__pyx_n_s_word_groups, __pyx_k_word_groups, sizeof(__pyx_k_word_groups), 0, 0, 1, 1}, {0, 0, 0, 0, 0, 0, 0} }; static int __Pyx_InitCachedBuiltins(void) { - __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 70, __pyx_L1_error) + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 280, __pyx_L1_error) return 0; __pyx_L1_error:; return -1; @@ -2868,27 +9106,44 @@ static int 
__Pyx_InitCachedConstants(void) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); - /* "gumbocy.pyx":100 + /* "gumbocy.pyx":378 + * if re2_search(href, deref(_RE2_EXTERNAL_HREF)): + * if self.analyze_external_hyperlinks: + * if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): # <<<<<<<<<<<<<< + * self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + * else: + */ + __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_s_http); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__6); + __Pyx_GIVEREF(__pyx_tuple__6); + __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_s_https); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__7); + __Pyx_GIVEREF(__pyx_tuple__7); + __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_s__8); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(0, 378, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); + + /* "gumbocy.pyx":638 * * if self.has_ids_ignore: - * if attrs.get("id") and attrs["id"].lower() in self.ids_ignore: # <<<<<<<<<<<<<< + * if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)): # <<<<<<<<<<<<<< * return 0 * */ - __pyx_tuple__2 = PyTuple_Pack(1, __pyx_n_s_id); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(0, 100, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__2); - __Pyx_GIVEREF(__pyx_tuple__2); + __pyx_tuple__10 = PyTuple_Pack(1, __pyx_n_s_id); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(0, 638, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__10); + __Pyx_GIVEREF(__pyx_tuple__10); - /* "gumbocy.pyx":10 - * + /* "gumbocy.pyx":23 + * cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:") * * _RE_SPLIT_WHITESPACE = re.compile(r"\s+") # <<<<<<<<<<<<<< * - * + * ctypedef enum AttributeNames: */ - __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s_s); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 10, __pyx_L1_error) - 
__Pyx_GOTREF(__pyx_tuple__3); - __Pyx_GIVEREF(__pyx_tuple__3); + __pyx_tuple__11 = PyTuple_Pack(1, __pyx_kp_s_s); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 23, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__11); + __Pyx_GIVEREF(__pyx_tuple__11); __Pyx_RefNannyFinishContext(); return 0; __pyx_L1_error:; @@ -2897,6 +9152,7 @@ static int __Pyx_InitCachedConstants(void) { } static int __Pyx_InitGlobals(void) { + __pyx_umethod_PyList_Type_pop.type = (PyObject*)&PyList_Type; if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error); __pyx_int_999 = PyInt_FromLong(999); if (unlikely(!__pyx_int_999)) __PYX_ERR(0, 1, __pyx_L1_error) return 0; @@ -2913,7 +9169,9 @@ PyMODINIT_FUNC PyInit_gumbocy(void) #endif { PyObject *__pyx_t_1 = NULL; - PyObject *__pyx_t_2 = NULL; + std::vector<__pyx_t_5re2cy_ArgPtr> *__pyx_t_2; + re2::RE2 *__pyx_t_3; + PyObject *__pyx_t_4 = NULL; __Pyx_RefNannyDeclarations #if CYTHON_REFNANNY __Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); @@ -2989,12 +9247,26 @@ PyMODINIT_FUNC PyInit_gumbocy(void) /*--- Variable export code ---*/ /*--- Function export code ---*/ /*--- Type init code ---*/ + if (PyType_Ready(&__pyx_type_7gumbocy_Attributes) < 0) __PYX_ERR(0, 54, __pyx_L1_error) + __pyx_type_7gumbocy_Attributes.tp_print = 0; + if (PyObject_SetAttrString(__pyx_m, "Attributes", (PyObject *)&__pyx_type_7gumbocy_Attributes) < 0) __PYX_ERR(0, 54, __pyx_L1_error) + __pyx_ptype_7gumbocy_Attributes = &__pyx_type_7gumbocy_Attributes; __pyx_vtabptr_7gumbocy_HTMLParser = &__pyx_vtable_7gumbocy_HTMLParser; - __pyx_vtable_7gumbocy_HTMLParser._traverse_node = (int (*)(struct __pyx_obj_7gumbocy_HTMLParser *, int, GumboNode *))__pyx_f_7gumbocy_10HTMLParser__traverse_node; - if (PyType_Ready(&__pyx_type_7gumbocy_HTMLParser) < 0) __PYX_ERR(0, 13, __pyx_L1_error) + __pyx_vtable_7gumbocy_HTMLParser.guess_node_hidden = (int (*)(struct __pyx_obj_7gumbocy_HTMLParser *, GumboNode *, struct __pyx_obj_7gumbocy_Attributes 
*))__pyx_f_7gumbocy_10HTMLParser_guess_node_hidden; + __pyx_vtable_7gumbocy_HTMLParser.guess_node_boilerplate = (int (*)(struct __pyx_obj_7gumbocy_HTMLParser *, GumboNode *, struct __pyx_obj_7gumbocy_Attributes *))__pyx_f_7gumbocy_10HTMLParser_guess_node_boilerplate; + __pyx_vtable_7gumbocy_HTMLParser.get_attributes = (struct __pyx_obj_7gumbocy_Attributes *(*)(struct __pyx_obj_7gumbocy_HTMLParser *, GumboNode *))__pyx_f_7gumbocy_10HTMLParser_get_attributes; + __pyx_vtable_7gumbocy_HTMLParser.close_word_group = (void (*)(struct __pyx_obj_7gumbocy_HTMLParser *))__pyx_f_7gumbocy_10HTMLParser_close_word_group; + __pyx_vtable_7gumbocy_HTMLParser.add_text = (void (*)(struct __pyx_obj_7gumbocy_HTMLParser *, PyObject *))__pyx_f_7gumbocy_10HTMLParser_add_text; + __pyx_vtable_7gumbocy_HTMLParser.add_hyperlink_text = (void (*)(struct __pyx_obj_7gumbocy_HTMLParser *, PyObject *))__pyx_f_7gumbocy_10HTMLParser_add_hyperlink_text; + __pyx_vtable_7gumbocy_HTMLParser.open_hyperlink = (void (*)(struct __pyx_obj_7gumbocy_HTMLParser *, struct __pyx_obj_7gumbocy_Attributes *))__pyx_f_7gumbocy_10HTMLParser_open_hyperlink; + __pyx_vtable_7gumbocy_HTMLParser.close_hyperlink = (void (*)(struct __pyx_obj_7gumbocy_HTMLParser *))__pyx_f_7gumbocy_10HTMLParser_close_hyperlink; + __pyx_vtable_7gumbocy_HTMLParser._traverse_node = (int (*)(struct __pyx_obj_7gumbocy_HTMLParser *, int, GumboNode *, int, int, int, int, int))__pyx_f_7gumbocy_10HTMLParser__traverse_node; + __pyx_vtable_7gumbocy_HTMLParser._traverse_node_simple = (int (*)(struct __pyx_obj_7gumbocy_HTMLParser *, int, GumboNode *))__pyx_f_7gumbocy_10HTMLParser__traverse_node_simple; + __pyx_vtable_7gumbocy_HTMLParser.free = (PyObject *(*)(struct __pyx_obj_7gumbocy_HTMLParser *))__pyx_f_7gumbocy_10HTMLParser_free; + if (PyType_Ready(&__pyx_type_7gumbocy_HTMLParser) < 0) __PYX_ERR(0, 65, __pyx_L1_error) __pyx_type_7gumbocy_HTMLParser.tp_print = 0; - if (__Pyx_SetVtable(__pyx_type_7gumbocy_HTMLParser.tp_dict, 
__pyx_vtabptr_7gumbocy_HTMLParser) < 0) __PYX_ERR(0, 13, __pyx_L1_error) - if (PyObject_SetAttrString(__pyx_m, "HTMLParser", (PyObject *)&__pyx_type_7gumbocy_HTMLParser) < 0) __PYX_ERR(0, 13, __pyx_L1_error) + if (__Pyx_SetVtable(__pyx_type_7gumbocy_HTMLParser.tp_dict, __pyx_vtabptr_7gumbocy_HTMLParser) < 0) __PYX_ERR(0, 65, __pyx_L1_error) + if (PyObject_SetAttrString(__pyx_m, "HTMLParser", (PyObject *)&__pyx_type_7gumbocy_HTMLParser) < 0) __PYX_ERR(0, 65, __pyx_L1_error) __pyx_ptype_7gumbocy_HTMLParser = &__pyx_type_7gumbocy_HTMLParser; /*--- Type import code ---*/ /*--- Variable import code ---*/ @@ -3007,35 +9279,104 @@ PyMODINIT_FUNC PyInit_gumbocy(void) /* "gumbocy.pyx":1 * import re # <<<<<<<<<<<<<< * cimport gumbocy - * from libcpp.unordered_set cimport unordered_set + * cimport re2cy */ __pyx_t_1 = __Pyx_Import(__pyx_n_s_re, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_re, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gumbocy.pyx":10 + /* "gumbocy.pyx":13 + * int printf(const char* format, ...); * + * cdef vector[re2cy.ArgPtr] *argp = new vector[re2cy.ArgPtr]() # <<<<<<<<<<<<<< + * cdef re2cy.ArgPtr *empty_args = &(deref(argp)[0]) * - * _RE_SPLIT_WHITESPACE = re.compile(r"\s+") # <<<<<<<<<<<<<< + */ + try { + __pyx_t_2 = new std::vector<__pyx_t_5re2cy_ArgPtr> (); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 13, __pyx_L1_error) + } + __pyx_v_7gumbocy_argp = __pyx_t_2; + + /* "gumbocy.pyx":14 + * + * cdef vector[re2cy.ArgPtr] *argp = new vector[re2cy.ArgPtr]() + * cdef re2cy.ArgPtr *empty_args = &(deref(argp)[0]) # <<<<<<<<<<<<<< + * + * cdef bint re2_search(const char* s, re2cy.RE2 &pattern): + */ + __pyx_v_7gumbocy_empty_args = (&((*__pyx_v_7gumbocy_argp)[0])); + + /* "gumbocy.pyx":19 + * return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0) + * + * cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)") # <<<<<<<<<<<<<< + * cdef re2cy.RE2 *_RE2_EXTERNAL_HREF = new re2cy.RE2(r"^(?:[A-Za-z0-9\+\.\-]+\:)?\/\/") + * cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:") + */ + try { + __pyx_t_3 = new re2::RE2(((char const *)"(display\\s*\\:\\s*none)|(visibility\\s*\\:\\s*hidden)")); + } catch(...) { + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 19, __pyx_L1_error) + } + __pyx_v_7gumbocy__RE2_SEARCH_STYLE_HIDDEN = __pyx_t_3; + + /* "gumbocy.pyx":20 + * + * cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)") + * cdef re2cy.RE2 *_RE2_EXTERNAL_HREF = new re2cy.RE2(r"^(?:[A-Za-z0-9\+\.\-]+\:)?\/\/") # <<<<<<<<<<<<<< + * cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:") + * + */ + try { + __pyx_t_3 = new re2::RE2(((char const *)"^(?:[A-Za-z0-9\\+\\.\\-]+\\:)?\\/\\/")); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 20, __pyx_L1_error) + } + __pyx_v_7gumbocy__RE2_EXTERNAL_HREF = __pyx_t_3; + + /* "gumbocy.pyx":21 + * cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)") + * cdef re2cy.RE2 *_RE2_EXTERNAL_HREF = new re2cy.RE2(r"^(?:[A-Za-z0-9\+\.\-]+\:)?\/\/") + * cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:") # <<<<<<<<<<<<<< * + * _RE_SPLIT_WHITESPACE = re.compile(r"\s+") + */ + try { + __pyx_t_3 = new re2::RE2(((char const *)"^(?:javascript|mailto|ftp|about)\\:")); + } catch(...) { + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 21, __pyx_L1_error) + } + __pyx_v_7gumbocy__RE2_IGNORED_HREF = __pyx_t_3; + + /* "gumbocy.pyx":23 + * cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:") + * + * _RE_SPLIT_WHITESPACE = re.compile(r"\s+") # <<<<<<<<<<<<<< * + * ctypedef enum AttributeNames: */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_re); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_re); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_compile); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_compile); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 23, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - if (PyDict_SetItem(__pyx_d, __pyx_n_s_RE_SPLIT_WHITESPACE, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error) + 
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (PyDict_SetItem(__pyx_d, __pyx_n_s_RE_SPLIT_WHITESPACE, __pyx_t_1) < 0) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* "gumbocy.pyx":1 * import re # <<<<<<<<<<<<<< * cimport gumbocy - * from libcpp.unordered_set cimport unordered_set + * cimport re2cy */ __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); @@ -3047,7 +9388,7 @@ PyMODINIT_FUNC PyInit_gumbocy(void) goto __pyx_L0; __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); - __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_4); if (__pyx_m) { if (__pyx_d) { __Pyx_AddTraceback("init gumbocy", __pyx_clineno, __pyx_lineno, __pyx_filename); @@ -3239,41 +9580,131 @@ static void __Pyx_RaiseArgtupleInvalid( (num_expected == 1) ? "" : "s", num_found); } -/* BytesEquals */ -static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) { -#if CYTHON_COMPILING_IN_PYPY - return PyObject_RichCompareBool(s1, s2, equals); +/* ArgTypeTest */ +static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) { + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); +} +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (none_allowed && obj == Py_None) return 1; + else if (exact) { + if (likely(Py_TYPE(obj) == type)) return 1; + #if PY_MAJOR_VERSION == 2 + else if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(PyObject_TypeCheck(obj, type))) return 1; + } + __Pyx_RaiseArgumentTypeInvalid(name, obj, type); + return 0; +} + +/* dict_getitem_default */ +static PyObject* __Pyx_PyDict_GetItemDefault(PyObject* 
d, PyObject* key, PyObject* default_value) { + PyObject* value; +#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY + value = PyDict_GetItemWithError(d, key); + if (unlikely(!value)) { + if (unlikely(PyErr_Occurred())) + return NULL; + value = default_value; + } + Py_INCREF(value); #else - if (s1 == s2) { - return (equals == Py_EQ); - } else if (PyBytes_CheckExact(s1) & PyBytes_CheckExact(s2)) { - const char *ps1, *ps2; - Py_ssize_t length = PyBytes_GET_SIZE(s1); - if (length != PyBytes_GET_SIZE(s2)) - return (equals == Py_NE); - ps1 = PyBytes_AS_STRING(s1); - ps2 = PyBytes_AS_STRING(s2); - if (ps1[0] != ps2[0]) { - return (equals == Py_NE); - } else if (length == 1) { - return (equals == Py_EQ); - } else { - int result = memcmp(ps1, ps2, (size_t)length); - return (equals == Py_EQ) ? (result == 0) : (result != 0); + if (PyString_CheckExact(key) || PyUnicode_CheckExact(key) || PyInt_CheckExact(key)) { + value = PyDict_GetItem(d, key); + if (unlikely(!value)) { + value = default_value; } - } else if ((s1 == Py_None) & PyBytes_CheckExact(s2)) { - return (equals == Py_NE); - } else if ((s2 == Py_None) & PyBytes_CheckExact(s1)) { - return (equals == Py_NE); + Py_INCREF(value); } else { - int result; - PyObject* py_result = PyObject_RichCompare(s1, s2, equals); - if (!py_result) - return -1; - result = __Pyx_PyObject_IsTrue(py_result); - Py_DECREF(py_result); - return result; + if (default_value == Py_None) + default_value = NULL; + value = PyObject_CallMethodObjArgs( + d, __pyx_n_s_get, key, default_value, NULL); + } +#endif + return value; +} + +/* StringJoin */ +#if !CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyBytes_Join(PyObject* sep, PyObject* values) { + return PyObject_CallMethodObjArgs(sep, __pyx_n_s_join, values, NULL); +} +#endif + +/* PyErrFetchRestore */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, 
*tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* WriteUnraisableException */ +static void __Pyx_WriteUnraisable(const char *name, CYTHON_UNUSED int clineno, + CYTHON_UNUSED int lineno, CYTHON_UNUSED const char *filename, + int full_traceback, CYTHON_UNUSED int nogil) { + PyObject *old_exc, *old_val, *old_tb; + PyObject *ctx; + __Pyx_PyThreadState_declare +#ifdef WITH_THREAD + PyGILState_STATE state; + if (nogil) + state = PyGILState_Ensure(); +#ifdef _MSC_VER + else state = (PyGILState_STATE)-1; +#endif +#endif + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&old_exc, &old_val, &old_tb); + if (full_traceback) { + Py_XINCREF(old_exc); + Py_XINCREF(old_val); + Py_XINCREF(old_tb); + __Pyx_ErrRestore(old_exc, old_val, old_tb); + PyErr_PrintEx(1); + } + #if PY_MAJOR_VERSION < 3 + ctx = PyString_FromString(name); + #else + ctx = PyUnicode_FromString(name); + #endif + __Pyx_ErrRestore(old_exc, old_val, old_tb); + if (!ctx) { + PyErr_WriteUnraisable(Py_None); + } else { + PyErr_WriteUnraisable(ctx); + Py_DECREF(ctx); } +#ifdef WITH_THREAD + if (nogil) + PyGILState_Release(state); #endif } @@ -3293,109 +9724,68 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg PyExc_SystemError, "NULL result without error in PyObject_Call"); } - return result; -} -#endif - -/* SliceObject */ -static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice(PyObject* obj, - Py_ssize_t cstart, Py_ssize_t 
cstop, - PyObject** _py_start, PyObject** _py_stop, PyObject** _py_slice, - int has_cstart, int has_cstop, CYTHON_UNUSED int wraparound) { -#if CYTHON_COMPILING_IN_CPYTHON - PyMappingMethods* mp; -#if PY_MAJOR_VERSION < 3 - PySequenceMethods* ms = Py_TYPE(obj)->tp_as_sequence; - if (likely(ms && ms->sq_slice)) { - if (!has_cstart) { - if (_py_start && (*_py_start != Py_None)) { - cstart = __Pyx_PyIndex_AsSsize_t(*_py_start); - if ((cstart == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad; - } else - cstart = 0; - } - if (!has_cstop) { - if (_py_stop && (*_py_stop != Py_None)) { - cstop = __Pyx_PyIndex_AsSsize_t(*_py_stop); - if ((cstop == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad; - } else - cstop = PY_SSIZE_T_MAX; - } - if (wraparound && unlikely((cstart < 0) | (cstop < 0)) && likely(ms->sq_length)) { - Py_ssize_t l = ms->sq_length(obj); - if (likely(l >= 0)) { - if (cstop < 0) { - cstop += l; - if (cstop < 0) cstop = 0; - } - if (cstart < 0) { - cstart += l; - if (cstart < 0) cstart = 0; - } - } else { - if (!PyErr_ExceptionMatches(PyExc_OverflowError)) - goto bad; - PyErr_Clear(); - } - } - return ms->sq_slice(obj, cstart, cstop); - } -#endif - mp = Py_TYPE(obj)->tp_as_mapping; - if (likely(mp && mp->mp_subscript)) + return result; +} #endif - { - PyObject* result; - PyObject *py_slice, *py_start, *py_stop; - if (_py_slice) { - py_slice = *_py_slice; + +/* BytesEquals */ +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) { +#if CYTHON_COMPILING_IN_PYPY + return PyObject_RichCompareBool(s1, s2, equals); +#else + if (s1 == s2) { + return (equals == Py_EQ); + } else if (PyBytes_CheckExact(s1) & PyBytes_CheckExact(s2)) { + const char *ps1, *ps2; + Py_ssize_t length = PyBytes_GET_SIZE(s1); + if (length != PyBytes_GET_SIZE(s2)) + return (equals == Py_NE); + ps1 = PyBytes_AS_STRING(s1); + ps2 = PyBytes_AS_STRING(s2); + if (ps1[0] != ps2[0]) { + return (equals == Py_NE); + } else if (length == 1) { + return (equals == Py_EQ); } 
else { - PyObject* owned_start = NULL; - PyObject* owned_stop = NULL; - if (_py_start) { - py_start = *_py_start; - } else { - if (has_cstart) { - owned_start = py_start = PyInt_FromSsize_t(cstart); - if (unlikely(!py_start)) goto bad; - } else - py_start = Py_None; - } - if (_py_stop) { - py_stop = *_py_stop; - } else { - if (has_cstop) { - owned_stop = py_stop = PyInt_FromSsize_t(cstop); - if (unlikely(!py_stop)) { - Py_XDECREF(owned_start); - goto bad; - } - } else - py_stop = Py_None; - } - py_slice = PySlice_New(py_start, py_stop, Py_None); - Py_XDECREF(owned_start); - Py_XDECREF(owned_stop); - if (unlikely(!py_slice)) goto bad; + int result = memcmp(ps1, ps2, (size_t)length); + return (equals == Py_EQ) ? (result == 0) : (result != 0); } + } else if ((s1 == Py_None) & PyBytes_CheckExact(s2)) { + return (equals == Py_NE); + } else if ((s2 == Py_None) & PyBytes_CheckExact(s1)) { + return (equals == Py_NE); + } else { + int result; + PyObject* py_result = PyObject_RichCompare(s1, s2, equals); + if (!py_result) + return -1; + result = __Pyx_PyObject_IsTrue(py_result); + Py_DECREF(py_result); + return result; + } +#endif +} + +/* GetModuleGlobalName */ +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { + PyObject *result; #if CYTHON_COMPILING_IN_CPYTHON - result = mp->mp_subscript(obj, py_slice); + result = PyDict_GetItem(__pyx_d, name); + if (likely(result)) { + Py_INCREF(result); + } else { #else - result = PyObject_GetItem(obj, py_slice); + result = PyObject_GetItem(__pyx_d, name); + if (!result) { + PyErr_Clear(); #endif - if (!_py_slice) { - Py_DECREF(py_slice); - } - return result; + result = __Pyx_GetBuiltinName(name); } - PyErr_Format(PyExc_TypeError, - "'%.200s' object is unsliceable", Py_TYPE(obj)->tp_name); -bad: - return NULL; + return result; } /* PyObjectCallMethO */ -#if CYTHON_COMPILING_IN_CPYTHON + #if CYTHON_COMPILING_IN_CPYTHON static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) { 
PyObject *self, *result; PyCFunction cfunc; @@ -3415,7 +9805,7 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject #endif /* PyObjectCallOneArg */ -#if CYTHON_COMPILING_IN_CPYTHON + #if CYTHON_COMPILING_IN_CPYTHON static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *arg) { PyObject *result; PyObject *args = PyTuple_New(1); @@ -3450,7 +9840,7 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObjec #endif /* PyObjectCallNoArg */ - #if CYTHON_COMPILING_IN_CPYTHON + #if CYTHON_COMPILING_IN_CPYTHON static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) { #ifdef __Pyx_CyFunction_USED if (likely(PyCFunction_Check(func) || PyObject_TypeCheck(func, __pyx_CyFunctionType))) { @@ -3465,147 +9855,437 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) { } #endif -/* GetModuleGlobalName */ - static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { - PyObject *result; +/* PyObjectCallMethod1 */ + static PyObject* __Pyx_PyObject_CallMethod1(PyObject* obj, PyObject* method_name, PyObject* arg) { + PyObject *method, *result = NULL; + method = __Pyx_PyObject_GetAttrStr(obj, method_name); + if (unlikely(!method)) goto bad; #if CYTHON_COMPILING_IN_CPYTHON - result = PyDict_GetItem(__pyx_d, name); - if (likely(result)) { - Py_INCREF(result); + if (likely(PyMethod_Check(method))) { + PyObject *self = PyMethod_GET_SELF(method); + if (likely(self)) { + PyObject *args; + PyObject *function = PyMethod_GET_FUNCTION(method); + args = PyTuple_New(2); + if (unlikely(!args)) goto bad; + Py_INCREF(self); + PyTuple_SET_ITEM(args, 0, self); + Py_INCREF(arg); + PyTuple_SET_ITEM(args, 1, arg); + Py_INCREF(function); + Py_DECREF(method); method = NULL; + result = __Pyx_PyObject_Call(function, args, NULL); + Py_DECREF(args); + Py_DECREF(function); + return result; + } + } +#endif + result = __Pyx_PyObject_CallOneArg(method, arg); +bad: + Py_XDECREF(method); + return 
result; +} + +/* append */ + static CYTHON_INLINE int __Pyx_PyObject_Append(PyObject* L, PyObject* x) { + if (likely(PyList_CheckExact(L))) { + if (unlikely(__Pyx_PyList_Append(L, x) < 0)) return -1; } else { + PyObject* retval = __Pyx_PyObject_CallMethod1(L, __pyx_n_s_append, x); + if (unlikely(!retval)) + return -1; + Py_DECREF(retval); + } + return 0; +} + +/* GetItemInt */ + static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) { + PyObject *r; + if (!j) return NULL; + r = PyObject_GetItem(o, j); + Py_DECREF(j); + return r; +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (wraparound & unlikely(i < 0)) i += PyList_GET_SIZE(o); + if ((!boundscheck) || likely((0 <= i) & (i < PyList_GET_SIZE(o)))) { + PyObject *r = PyList_GET_ITEM(o, i); + Py_INCREF(r); + return r; + } + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); #else - result = PyObject_GetItem(__pyx_d, name); - if (!result) { - PyErr_Clear(); + return PySequence_GetItem(o, i); #endif - result = __Pyx_GetBuiltinName(name); +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (wraparound & unlikely(i < 0)) i += PyTuple_GET_SIZE(o); + if ((!boundscheck) || likely((0 <= i) & (i < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, i); + Py_INCREF(r); + return r; } - return result; + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +#else + return PySequence_GetItem(o, i); +#endif } - -/* PyErrFetchRestore */ - #if CYTHON_COMPILING_IN_CPYTHON -static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { - PyObject *tmp_type, *tmp_value, *tmp_tb; - tmp_type = tstate->curexc_type; - tmp_value = 
tstate->curexc_value; - tmp_tb = tstate->curexc_traceback; - tstate->curexc_type = type; - tstate->curexc_value = value; - tstate->curexc_traceback = tb; - Py_XDECREF(tmp_type); - Py_XDECREF(tmp_value); - Py_XDECREF(tmp_tb); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, int is_list, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (is_list || PyList_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o); + if ((!boundscheck) || (likely((n >= 0) & (n < PyList_GET_SIZE(o))))) { + PyObject *r = PyList_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } + else if (PyTuple_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyTuple_GET_SIZE(o); + if ((!boundscheck) || likely((n >= 0) & (n < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } else { + PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence; + if (likely(m && m->sq_item)) { + if (wraparound && unlikely(i < 0) && likely(m->sq_length)) { + Py_ssize_t l = m->sq_length(o); + if (likely(l >= 0)) { + i += l; + } else { + if (!PyErr_ExceptionMatches(PyExc_OverflowError)) + return NULL; + PyErr_Clear(); + } + } + return m->sq_item(o, i); + } + } +#else + if (is_list || PySequence_Check(o)) { + return PySequence_GetItem(o, i); + } +#endif + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); } -static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { - *type = tstate->curexc_type; - *value = tstate->curexc_value; - *tb = tstate->curexc_traceback; - tstate->curexc_type = 0; - tstate->curexc_value = 0; - tstate->curexc_traceback = 0; + +/* SetItemInt */ + static CYTHON_INLINE int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v) { + int r; + if (!j) return -1; + r = PyObject_SetItem(o, j, v); + Py_DECREF(j); + return r; } 
+static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v, int is_list, + CYTHON_NCP_UNUSED int wraparound, CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (is_list || PyList_CheckExact(o)) { + Py_ssize_t n = (!wraparound) ? i : ((likely(i >= 0)) ? i : i + PyList_GET_SIZE(o)); + if ((!boundscheck) || likely((n >= 0) & (n < PyList_GET_SIZE(o)))) { + PyObject* old = PyList_GET_ITEM(o, n); + Py_INCREF(v); + PyList_SET_ITEM(o, n, v); + Py_DECREF(old); + return 1; + } + } else { + PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence; + if (likely(m && m->sq_ass_item)) { + if (wraparound && unlikely(i < 0) && likely(m->sq_length)) { + Py_ssize_t l = m->sq_length(o); + if (likely(l >= 0)) { + i += l; + } else { + if (!PyErr_ExceptionMatches(PyExc_OverflowError)) + return -1; + PyErr_Clear(); + } + } + return m->sq_ass_item(o, i, v); + } + } +#else +#if CYTHON_COMPILING_IN_PYPY + if (is_list || (PySequence_Check(o) && !PyDict_Check(o))) { +#else + if (is_list || PySequence_Check(o)) { #endif + return PySequence_SetItem(o, i, v); + } +#endif + return __Pyx_SetItemInt_Generic(o, PyInt_FromSsize_t(i), v); +} -/* WriteUnraisableException */ - static void __Pyx_WriteUnraisable(const char *name, CYTHON_UNUSED int clineno, - CYTHON_UNUSED int lineno, CYTHON_UNUSED const char *filename, - int full_traceback, CYTHON_UNUSED int nogil) { - PyObject *old_exc, *old_val, *old_tb; - PyObject *ctx; - __Pyx_PyThreadState_declare -#ifdef WITH_THREAD - PyGILState_STATE state; - if (nogil) - state = PyGILState_Ensure(); -#ifdef _MSC_VER - else state = (PyGILState_STATE)-1; +/* SliceObject */ + static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice(PyObject* obj, + Py_ssize_t cstart, Py_ssize_t cstop, + PyObject** _py_start, PyObject** _py_stop, PyObject** _py_slice, + int has_cstart, int has_cstop, CYTHON_UNUSED int wraparound) { +#if CYTHON_COMPILING_IN_CPYTHON + PyMappingMethods* mp; +#if PY_MAJOR_VERSION < 3 + PySequenceMethods* ms = 
Py_TYPE(obj)->tp_as_sequence; + if (likely(ms && ms->sq_slice)) { + if (!has_cstart) { + if (_py_start && (*_py_start != Py_None)) { + cstart = __Pyx_PyIndex_AsSsize_t(*_py_start); + if ((cstart == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad; + } else + cstart = 0; + } + if (!has_cstop) { + if (_py_stop && (*_py_stop != Py_None)) { + cstop = __Pyx_PyIndex_AsSsize_t(*_py_stop); + if ((cstop == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad; + } else + cstop = PY_SSIZE_T_MAX; + } + if (wraparound && unlikely((cstart < 0) | (cstop < 0)) && likely(ms->sq_length)) { + Py_ssize_t l = ms->sq_length(obj); + if (likely(l >= 0)) { + if (cstop < 0) { + cstop += l; + if (cstop < 0) cstop = 0; + } + if (cstart < 0) { + cstart += l; + if (cstart < 0) cstart = 0; + } + } else { + if (!PyErr_ExceptionMatches(PyExc_OverflowError)) + goto bad; + PyErr_Clear(); + } + } + return ms->sq_slice(obj, cstart, cstop); + } +#endif + mp = Py_TYPE(obj)->tp_as_mapping; + if (likely(mp && mp->mp_subscript)) #endif + { + PyObject* result; + PyObject *py_slice, *py_start, *py_stop; + if (_py_slice) { + py_slice = *_py_slice; + } else { + PyObject* owned_start = NULL; + PyObject* owned_stop = NULL; + if (_py_start) { + py_start = *_py_start; + } else { + if (has_cstart) { + owned_start = py_start = PyInt_FromSsize_t(cstart); + if (unlikely(!py_start)) goto bad; + } else + py_start = Py_None; + } + if (_py_stop) { + py_stop = *_py_stop; + } else { + if (has_cstop) { + owned_stop = py_stop = PyInt_FromSsize_t(cstop); + if (unlikely(!py_stop)) { + Py_XDECREF(owned_start); + goto bad; + } + } else + py_stop = Py_None; + } + py_slice = PySlice_New(py_start, py_stop, Py_None); + Py_XDECREF(owned_start); + Py_XDECREF(owned_stop); + if (unlikely(!py_slice)) goto bad; + } +#if CYTHON_COMPILING_IN_CPYTHON + result = mp->mp_subscript(obj, py_slice); +#else + result = PyObject_GetItem(obj, py_slice); #endif - __Pyx_PyThreadState_assign - __Pyx_ErrFetch(&old_exc, &old_val, &old_tb); - if (full_traceback) { - 
Py_XINCREF(old_exc); - Py_XINCREF(old_val); - Py_XINCREF(old_tb); - __Pyx_ErrRestore(old_exc, old_val, old_tb); - PyErr_PrintEx(1); - } - #if PY_MAJOR_VERSION < 3 - ctx = PyString_FromString(name); - #else - ctx = PyUnicode_FromString(name); - #endif - __Pyx_ErrRestore(old_exc, old_val, old_tb); - if (!ctx) { - PyErr_WriteUnraisable(Py_None); - } else { - PyErr_WriteUnraisable(ctx); - Py_DECREF(ctx); + if (!_py_slice) { + Py_DECREF(py_slice); + } + return result; } -#ifdef WITH_THREAD - if (nogil) - PyGILState_Release(state); -#endif -} - -/* ArgTypeTest */ - static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) { PyErr_Format(PyExc_TypeError, - "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", - name, type->tp_name, Py_TYPE(obj)->tp_name); + "'%.200s' object is unsliceable", Py_TYPE(obj)->tp_name); +bad: + return NULL; } -static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, - const char *name, int exact) -{ - if (unlikely(!type)) { - PyErr_SetString(PyExc_SystemError, "Missing type object"); - return 0; - } - if (none_allowed && obj == Py_None) return 1; - else if (exact) { - if (likely(Py_TYPE(obj) == type)) return 1; - #if PY_MAJOR_VERSION == 2 - else if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; - #endif - } - else { - if (likely(PyObject_TypeCheck(obj, type))) return 1; + +/* PyObjectCallMethod2 */ + static PyObject* __Pyx_PyObject_CallMethod2(PyObject* obj, PyObject* method_name, PyObject* arg1, PyObject* arg2) { + PyObject *args, *method, *result = NULL; + method = __Pyx_PyObject_GetAttrStr(obj, method_name); +#if CYTHON_COMPILING_IN_CPYTHON + if (likely(PyMethod_Check(method)) && likely(PyMethod_GET_SELF(method))) { + PyObject *self, *function; + self = PyMethod_GET_SELF(method); + function = PyMethod_GET_FUNCTION(method); + args = PyTuple_New(3); + if (unlikely(!args)) goto bad; + Py_INCREF(self); + 
PyTuple_SET_ITEM(args, 0, self); + Py_INCREF(arg1); + PyTuple_SET_ITEM(args, 1, arg1); + Py_INCREF(arg2); + PyTuple_SET_ITEM(args, 2, arg2); + Py_INCREF(function); + Py_DECREF(method); + method = function; + } else +#endif + { + args = PyTuple_New(2); + if (unlikely(!args)) goto bad; + Py_INCREF(arg1); + PyTuple_SET_ITEM(args, 0, arg1); + Py_INCREF(arg2); + PyTuple_SET_ITEM(args, 1, arg2); } - __Pyx_RaiseArgumentTypeInvalid(name, obj, type); - return 0; + result = __Pyx_PyObject_Call(method, args, NULL); + Py_DECREF(args); + Py_DECREF(method); + return result; +bad: + Py_XDECREF(method); + return result; } -/* dict_getitem_default */ - static PyObject* __Pyx_PyDict_GetItemDefault(PyObject* d, PyObject* key, PyObject* default_value) { +/* dict_setdefault */ + static CYTHON_INLINE PyObject *__Pyx_PyDict_SetDefault(PyObject *d, PyObject *key, PyObject *default_value, + CYTHON_UNUSED int is_safe_type) { PyObject* value; +#if PY_VERSION_HEX >= 0x030400A0 + if (1) { + value = PyDict_SetDefault(d, key, default_value); + if (unlikely(!value)) return NULL; + Py_INCREF(value); +#else + if (is_safe_type == 1 || (is_safe_type == -1 && #if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY - value = PyDict_GetItemWithError(d, key); - if (unlikely(!value)) { - if (unlikely(PyErr_Occurred())) - return NULL; - value = default_value; - } - Py_INCREF(value); + (PyUnicode_CheckExact(key) || PyString_CheckExact(key) || PyLong_CheckExact(key)))) { + value = PyDict_GetItemWithError(d, key); + if (unlikely(!value)) { + if (unlikely(PyErr_Occurred())) + return NULL; + if (unlikely(PyDict_SetItem(d, key, default_value) == -1)) + return NULL; + value = default_value; + } + Py_INCREF(value); #else - if (PyString_CheckExact(key) || PyUnicode_CheckExact(key) || PyInt_CheckExact(key)) { + (PyString_CheckExact(key) || PyUnicode_CheckExact(key) || PyInt_CheckExact(key) || PyLong_CheckExact(key)))) { value = PyDict_GetItem(d, key); if (unlikely(!value)) { + if (unlikely(PyDict_SetItem(d, key, 
default_value) == -1)) + return NULL; value = default_value; } Py_INCREF(value); +#endif +#endif } else { - if (default_value == Py_None) - default_value = NULL; - value = PyObject_CallMethodObjArgs( - d, __pyx_n_s_get, key, default_value, NULL); + value = __Pyx_PyObject_CallMethod2(d, __pyx_n_s_setdefault, key, default_value); } -#endif return value; } +/* PyObjectCallMethod0 */ + static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name) { + PyObject *method, *result = NULL; + method = __Pyx_PyObject_GetAttrStr(obj, method_name); + if (unlikely(!method)) goto bad; +#if CYTHON_COMPILING_IN_CPYTHON + if (likely(PyMethod_Check(method))) { + PyObject *self = PyMethod_GET_SELF(method); + if (likely(self)) { + PyObject *function = PyMethod_GET_FUNCTION(method); + result = __Pyx_PyObject_CallOneArg(function, self); + Py_DECREF(method); + return result; + } + } +#endif + result = __Pyx_PyObject_CallNoArg(method); + Py_DECREF(method); +bad: + return result; +} + +/* UnpackUnboundCMethod */ + static int __Pyx_TryUnpackUnboundCMethod(__Pyx_CachedCFunction* target) { + PyObject *method; + method = __Pyx_PyObject_GetAttrStr(target->type, *target->method_name); + if (unlikely(!method)) + return -1; + target->method = method; +#if CYTHON_COMPILING_IN_CPYTHON + #if PY_MAJOR_VERSION >= 3 + if (likely(PyObject_TypeCheck(method, &PyMethodDescr_Type))) + #endif + { + PyMethodDescrObject *descr = (PyMethodDescrObject*) method; + target->func = descr->d_method->ml_meth; + target->flag = descr->d_method->ml_flags & (METH_VARARGS | METH_KEYWORDS | METH_O | METH_NOARGS); + } +#endif + return 0; +} + +/* CallUnboundCMethod0 */ + static PyObject* __Pyx__CallUnboundCMethod0(__Pyx_CachedCFunction* cfunc, PyObject* self) { + PyObject *args, *result = NULL; + if (unlikely(!cfunc->method) && unlikely(__Pyx_TryUnpackUnboundCMethod(cfunc) < 0)) return NULL; +#if CYTHON_COMPILING_IN_CPYTHON + args = PyTuple_New(1); + if (unlikely(!args)) goto bad; + Py_INCREF(self); + 
PyTuple_SET_ITEM(args, 0, self); +#else + args = PyTuple_Pack(1, self); + if (unlikely(!args)) goto bad; +#endif + result = __Pyx_PyObject_Call(cfunc->method, args, NULL); + Py_DECREF(args); +bad: + return result; +} + +/* pop */ + static CYTHON_INLINE PyObject* __Pyx__PyObject_Pop(PyObject* L) { +#if CYTHON_COMPILING_IN_CPYTHON + if (Py_TYPE(L) == &PySet_Type) { + return PySet_Pop(L); + } +#endif + return __Pyx_PyObject_CallMethod0(L, __pyx_n_s_pop); +} +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyList_Pop(PyObject* L) { + if (likely(PyList_GET_SIZE(L) > (((PyListObject*)L)->allocated >> 1))) { + Py_SIZE(L) -= 1; + return PyList_GET_ITEM(L, PyList_GET_SIZE(L)); + } + return __Pyx_CallUnboundCMethod0(&__pyx_umethod_PyList_Type_pop, L); +} +#endif + /* SetVTable */ - static int __Pyx_SetVtable(PyObject *dict, void *vtable) { + static int __Pyx_SetVtable(PyObject *dict, void *vtable) { #if PY_VERSION_HEX >= 0x02070000 PyObject *ob = PyCapsule_New(vtable, 0, 0); #else @@ -3623,7 +10303,7 @@ static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, in } /* Import */ - static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { PyObject *empty_list = 0; PyObject *module = 0; PyObject *global_dict = 0; @@ -3697,7 +10377,7 @@ static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, in } /* CodeObjectCache */ - static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { int start = 0, mid = 0, end = count - 1; if (end >= 0 && code_line > entries[end].code_line) { return count; @@ -3777,7 +10457,7 @@ static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { } /* AddTraceback */ - #include "compile.h" + #include "compile.h" #include 
"frameobject.h" #include "traceback.h" static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( @@ -3857,35 +10537,57 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, Py_XDECREF(py_frame); } +/* CIntFromPyVerify */ + #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) +#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) +#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ + {\ + func_type value = func_value;\ + if (sizeof(target_type) < sizeof(func_type)) {\ + if (unlikely(value != (func_type) (target_type) value)) {\ + func_type zero = 0;\ + if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ + return (target_type) -1;\ + if (is_unsigned && unlikely(value < zero))\ + goto raise_neg_overflow;\ + else\ + goto raise_overflow;\ + }\ + }\ + return (target_type) value;\ + } + /* CIntToPy */ - static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { - const int neg_one = (int) -1, const_zero = (int) 0; + static CYTHON_INLINE PyObject* __Pyx_PyInt_From___pyx_t_7gumbocy_AttributeNames(__pyx_t_7gumbocy_AttributeNames value) { + const __pyx_t_7gumbocy_AttributeNames neg_one = (__pyx_t_7gumbocy_AttributeNames) -1, const_zero = (__pyx_t_7gumbocy_AttributeNames) 0; const int is_unsigned = neg_one > const_zero; if (is_unsigned) { - if (sizeof(int) < sizeof(long)) { + if (sizeof(__pyx_t_7gumbocy_AttributeNames) < sizeof(long)) { return PyInt_FromLong((long) value); - } else if (sizeof(int) <= sizeof(unsigned long)) { + } else if (sizeof(__pyx_t_7gumbocy_AttributeNames) <= sizeof(unsigned long)) { return PyLong_FromUnsignedLong((unsigned long) value); - } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + } else if (sizeof(__pyx_t_7gumbocy_AttributeNames) <= sizeof(unsigned PY_LONG_LONG)) { return PyLong_FromUnsignedLongLong((unsigned 
PY_LONG_LONG) value); } } else { - if (sizeof(int) <= sizeof(long)) { + if (sizeof(__pyx_t_7gumbocy_AttributeNames) <= sizeof(long)) { return PyInt_FromLong((long) value); - } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + } else if (sizeof(__pyx_t_7gumbocy_AttributeNames) <= sizeof(PY_LONG_LONG)) { return PyLong_FromLongLong((PY_LONG_LONG) value); } } { int one = 1; int little = (int)*(unsigned char *)&one; unsigned char *bytes = (unsigned char *)&value; - return _PyLong_FromByteArray(bytes, sizeof(int), + return _PyLong_FromByteArray(bytes, sizeof(__pyx_t_7gumbocy_AttributeNames), little, !is_unsigned); } } /* CIntToPy */ - static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_int(unsigned int value) { + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_int(unsigned int value) { const unsigned int neg_one = (unsigned int) -1, const_zero = (unsigned int) 0; const int is_unsigned = neg_one > const_zero; if (is_unsigned) { @@ -3911,42 +10613,74 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } } -/* CIntFromPyVerify */ - #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ - __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) -#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ - __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) -#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ - {\ - func_type value = func_value;\ - if (sizeof(target_type) < sizeof(func_type)) {\ - if (unlikely(value != (func_type) (target_type) value)) {\ - func_type zero = 0;\ - if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ - return (target_type) -1;\ - if (is_unsigned && unlikely(value < zero))\ - goto raise_neg_overflow;\ - else\ - goto raise_overflow;\ - }\ - }\ - return (target_type) value;\ +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + const long neg_one = (long) -1, const_zero = (long) 0; + const int 
is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { + const int neg_one = (int) -1, const_zero = (int) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); + } + } else { + if (sizeof(int) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(int), + little, !is_unsigned); } +} /* CIntFromPy */ - static CYTHON_INLINE unsigned int __Pyx_PyInt_As_unsigned_int(PyObject *x) { - const unsigned int neg_one = (unsigned int) -1, const_zero = (unsigned int) 0; + static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + const int neg_one = (int) -1, const_zero = (int) 0; const int 
is_unsigned = neg_one > const_zero; #if PY_MAJOR_VERSION < 3 if (likely(PyInt_Check(x))) { - if (sizeof(unsigned int) < sizeof(long)) { - __PYX_VERIFY_RETURN_INT(unsigned int, long, PyInt_AS_LONG(x)) + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) } else { long val = PyInt_AS_LONG(x); if (is_unsigned && unlikely(val < 0)) { goto raise_neg_overflow; } - return (unsigned int) val; + return (int) val; } } else #endif @@ -3955,32 +10689,32 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, #if CYTHON_USE_PYLONG_INTERNALS const digit* digits = ((PyLongObject*)x)->ob_digit; switch (Py_SIZE(x)) { - case 0: return (unsigned int) 0; - case 1: __PYX_VERIFY_RETURN_INT(unsigned int, digit, digits[0]) + case 0: return (int) 0; + case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) case 2: - if (8 * sizeof(unsigned int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) >= 2 * PyLong_SHIFT) { - return (unsigned int) (((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { + return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); } } break; case 3: - if (8 * sizeof(unsigned int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) >= 3 * PyLong_SHIFT) { - return (unsigned int) (((((((unsigned int)digits[2]) 
<< PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { + return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); } } break; case 4: - if (8 * sizeof(unsigned int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) >= 4 * PyLong_SHIFT) { - return (unsigned int) (((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { + return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); } } break; @@ -3994,83 +10728,83 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, { int result = PyObject_RichCompareBool(x, Py_False, Py_LT); if (unlikely(result < 0)) - return (unsigned int) -1; + return (int) -1; if (unlikely(result == 1)) goto raise_neg_overflow; } #endif - if (sizeof(unsigned int) <= sizeof(unsigned long)) { - __PYX_VERIFY_RETURN_INT_EXC(unsigned int, unsigned long, PyLong_AsUnsignedLong(x)) - } else if (sizeof(unsigned int) <= sizeof(unsigned PY_LONG_LONG)) { - 
__PYX_VERIFY_RETURN_INT_EXC(unsigned int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) } } else { #if CYTHON_USE_PYLONG_INTERNALS const digit* digits = ((PyLongObject*)x)->ob_digit; switch (Py_SIZE(x)) { - case 0: return (unsigned int) 0; - case -1: __PYX_VERIFY_RETURN_INT(unsigned int, sdigit, (sdigit) (-(sdigit)digits[0])) - case 1: __PYX_VERIFY_RETURN_INT(unsigned int, digit, +digits[0]) + case 0: return (int) 0; + case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) case -2: - if (8 * sizeof(unsigned int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { - return (unsigned int) (((unsigned int)-1)*(((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); } } break; case 2: - if (8 * sizeof(unsigned int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { - return (unsigned int) ((((((unsigned int)digits[1]) << 
PyLong_SHIFT) | (unsigned int)digits[0]))); + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); } } break; case -3: - if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { - return (unsigned int) (((unsigned int)-1)*(((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); } } break; case 3: - if (8 * sizeof(unsigned int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { - return (unsigned int) ((((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * 
PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); } } break; case -4: - if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) - 1 > 4 * PyLong_SHIFT) { - return (unsigned int) (((unsigned int)-1)*(((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); } } break; case 4: - if (8 * sizeof(unsigned int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(unsigned int) - 1 > 4 * PyLong_SHIFT) { - return (unsigned int) ((((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned 
long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); } } break; } #endif - if (sizeof(unsigned int) <= sizeof(long)) { - __PYX_VERIFY_RETURN_INT_EXC(unsigned int, long, PyLong_AsLong(x)) - } else if (sizeof(unsigned int) <= sizeof(PY_LONG_LONG)) { - __PYX_VERIFY_RETURN_INT_EXC(unsigned int, PY_LONG_LONG, PyLong_AsLongLong(x)) + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) } } { @@ -4078,7 +10812,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, PyErr_SetString(PyExc_RuntimeError, "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); #else - unsigned int val; + int val; PyObject *v = __Pyx_PyNumber_IntOrLong(x); #if PY_MAJOR_VERSION < 3 if (likely(v) && !PyLong_Check(v)) { @@ -4098,40 +10832,40 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, return val; } #endif - return (unsigned int) -1; + return (int) -1; } } else { - unsigned int val; + int val; PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); - if (!tmp) return (unsigned int) -1; - val = __Pyx_PyInt_As_unsigned_int(tmp); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); Py_DECREF(tmp); return val; } raise_overflow: PyErr_SetString(PyExc_OverflowError, - "value too large to convert to unsigned int"); - return (unsigned int) -1; + "value too large to convert to int"); + return (int) -1; raise_neg_overflow: PyErr_SetString(PyExc_OverflowError, - "can't convert negative value to unsigned int"); - return (unsigned int) -1; + "can't convert negative value to int"); + return (int) -1; } /* CIntFromPy */ - static CYTHON_INLINE 
int __Pyx_PyInt_As_int(PyObject *x) { - const int neg_one = (int) -1, const_zero = (int) 0; + static CYTHON_INLINE unsigned int __Pyx_PyInt_As_unsigned_int(PyObject *x) { + const unsigned int neg_one = (unsigned int) -1, const_zero = (unsigned int) 0; const int is_unsigned = neg_one > const_zero; #if PY_MAJOR_VERSION < 3 if (likely(PyInt_Check(x))) { - if (sizeof(int) < sizeof(long)) { - __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) + if (sizeof(unsigned int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(unsigned int, long, PyInt_AS_LONG(x)) } else { long val = PyInt_AS_LONG(x); if (is_unsigned && unlikely(val < 0)) { goto raise_neg_overflow; } - return (int) val; + return (unsigned int) val; } } else #endif @@ -4140,32 +10874,32 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, #if CYTHON_USE_PYLONG_INTERNALS const digit* digits = ((PyLongObject*)x)->ob_digit; switch (Py_SIZE(x)) { - case 0: return (int) 0; - case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) + case 0: return (unsigned int) 0; + case 1: __PYX_VERIFY_RETURN_INT(unsigned int, digit, digits[0]) case 2: - if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) > 1 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { - return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) >= 2 * PyLong_SHIFT) { + return (unsigned int) (((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); } } break; case 3: - if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) > 2 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, 
unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { - return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) >= 3 * PyLong_SHIFT) { + return (unsigned int) (((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); } } break; case 4: - if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) > 3 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { - return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) >= 4 * PyLong_SHIFT) { + return (unsigned int) (((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); } } break; @@ -4179,83 +10913,83 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, { int result = PyObject_RichCompareBool(x, Py_False, Py_LT); if (unlikely(result < 0)) - return (int) -1; + return (unsigned int) -1; if 
(unlikely(result == 1)) goto raise_neg_overflow; } #endif - if (sizeof(int) <= sizeof(unsigned long)) { - __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) - } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { - __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) + if (sizeof(unsigned int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, unsigned long, PyLong_AsUnsignedLong(x)) + } else if (sizeof(unsigned int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) } } else { #if CYTHON_USE_PYLONG_INTERNALS const digit* digits = ((PyLongObject*)x)->ob_digit; switch (Py_SIZE(x)) { - case 0: return (int) 0; - case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) - case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) + case 0: return (unsigned int) 0; + case -1: __PYX_VERIFY_RETURN_INT(unsigned int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(unsigned int, digit, +digits[0]) case -2: - if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) - 1 > 1 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { - return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { + return (unsigned int) (((unsigned int)-1)*(((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); } } break; case 2: - if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) > 1 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 2 * 
PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { - return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { + return (unsigned int) ((((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); } } break; case -3: - if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { - return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { + return (unsigned int) (((unsigned int)-1)*(((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); } } break; case 3: - if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) > 2 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { - return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); 
+ __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { + return (unsigned int) ((((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); } } break; case -4: - if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { - return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 4 * PyLong_SHIFT) { + return (unsigned int) (((unsigned int)-1)*(((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); } } break; case 4: - if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned int) > 3 * PyLong_SHIFT) { if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { - __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) - } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { - return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | 
(int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 4 * PyLong_SHIFT) { + return (unsigned int) ((((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); } } break; } #endif - if (sizeof(int) <= sizeof(long)) { - __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) - } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { - __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) + if (sizeof(unsigned int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, long, PyLong_AsLong(x)) + } else if (sizeof(unsigned int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, PY_LONG_LONG, PyLong_AsLongLong(x)) } } { @@ -4263,7 +10997,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, PyErr_SetString(PyExc_RuntimeError, "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); #else - int val; + unsigned int val; PyObject *v = __Pyx_PyNumber_IntOrLong(x); #if PY_MAJOR_VERSION < 3 if (likely(v) && !PyLong_Check(v)) { @@ -4283,55 +11017,28 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, return val; } #endif - return (int) -1; + return (unsigned int) -1; } } else { - int val; + unsigned int val; PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); - if (!tmp) return (int) -1; - val = __Pyx_PyInt_As_int(tmp); + if (!tmp) return (unsigned int) -1; + val = __Pyx_PyInt_As_unsigned_int(tmp); Py_DECREF(tmp); return val; } raise_overflow: PyErr_SetString(PyExc_OverflowError, - "value too large to convert to int"); - return (int) -1; + "value too large to convert to unsigned int"); + 
return (unsigned int) -1; raise_neg_overflow: PyErr_SetString(PyExc_OverflowError, - "can't convert negative value to int"); - return (int) -1; -} - -/* CIntToPy */ - static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { - const long neg_one = (long) -1, const_zero = (long) 0; - const int is_unsigned = neg_one > const_zero; - if (is_unsigned) { - if (sizeof(long) < sizeof(long)) { - return PyInt_FromLong((long) value); - } else if (sizeof(long) <= sizeof(unsigned long)) { - return PyLong_FromUnsignedLong((unsigned long) value); - } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { - return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); - } - } else { - if (sizeof(long) <= sizeof(long)) { - return PyInt_FromLong((long) value); - } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { - return PyLong_FromLongLong((PY_LONG_LONG) value); - } - } - { - int one = 1; int little = (int)*(unsigned char *)&one; - unsigned char *bytes = (unsigned char *)&value; - return _PyLong_FromByteArray(bytes, sizeof(long), - little, !is_unsigned); - } + "can't convert negative value to unsigned int"); + return (unsigned int) -1; } /* CIntFromPy */ - static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { const long neg_one = (long) -1, const_zero = (long) 0; const int is_unsigned = neg_one > const_zero; #if PY_MAJOR_VERSION < 3 @@ -4516,7 +11223,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* CheckBinaryVersion */ - static int __Pyx_check_binary_version(void) { + static int __Pyx_check_binary_version(void) { char ctversion[4], rtversion[4]; PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); @@ -4532,7 +11239,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* InitStrings */ - static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + static int 
__Pyx_InitStrings(__Pyx_StringTabEntry *t) { while (t->p) { #if PY_MAJOR_VERSION < 3 if (t->is_unicode) { diff --git a/gumbocy.pxd b/gumbocy.pxd index 5ed3e33..724edba 100644 --- a/gumbocy.pxd +++ b/gumbocy.pxd @@ -1,5 +1,6 @@ # https://github.com/google/gumbo-parser/blob/master/src/gumbo.h + cdef extern from "gumbo.h": ctypedef enum GumboNamespaceEnum: @@ -187,7 +188,7 @@ cdef extern from "gumbo.h": GUMBO_INSERTION_FROM_ISINDEX = 1 << 5, GUMBO_INSERTION_FROM_IMAGE = 1 << 6, - + GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8, diff --git a/gumbocy.pyx b/gumbocy.pyx index cd8a691..ba5f15f 100644 --- a/gumbocy.pyx +++ b/gumbocy.pyx @@ -1,38 +1,576 @@ import re cimport gumbocy +cimport re2cy from libcpp.unordered_set cimport unordered_set +from cython.operator cimport dereference as deref +from libcpp.vector cimport vector +from libcpp.map cimport map cdef extern from "stdio.h": int printf(const char* format, ...); +cdef vector[re2cy.ArgPtr] *argp = new vector[re2cy.ArgPtr]() +cdef re2cy.ArgPtr *empty_args = &(deref(argp)[0]) + +cdef bint re2_search(const char* s, re2cy.RE2 &pattern): + return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0) + +cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)") +cdef re2cy.RE2 *_RE2_EXTERNAL_HREF = new re2cy.RE2(r"^(?:[A-Za-z0-9\+\.\-]+\:)?\/\/") +cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:") _RE_SPLIT_WHITESPACE = re.compile(r"\s+") +ctypedef enum AttributeNames: + ATTR_ID, + ATTR_ROLE, + ATTR_HREF, + ATTR_STYLE, + ATTR_REL, + ATTR_SRC, + ATTR_ALT, + ATTR_NAME, + ATTR_PROPERTY, + ATTR_CONTENT + +# ATTR_ID = 0 +# ATTR_ROLE = 1 +# ATTR_HREF = 2 +# ATTR_STYLE = 3 +# ATTR_REL = 4 +# ATTR_SRC = 5 +# ATTR_ALT = 6 +# ATTR_NAME = 7 +# ATTR_PROPERTY = 8 +# ATTR_CONTENT = 9 + +# cdef struct Attributes: +# int size_classes +# vector[char*] classes +# bint has_hidden +# 
map[AttributeNames, const char*] values + +cdef class Attributes: + cdef int size_classes + cdef dict values + # cdef map[AttributeNames, const char*] values + # cdef const char* values[10] + # cdef vector[char*] classes + cdef list classes + cdef bint has_hidden + +# ctypedef sAttributes Attributes cdef class HTMLParser: - cdef char* html - cdef gumbocy.GumboOutput* output - cdef list nodes - + # Global parser variables cdef int nesting_limit cdef bint head_only cdef bint has_ids_ignore cdef bint has_classes_ignore + cdef bint has_ids_hidden + cdef bint has_classes_hidden cdef bint has_attributes_whitelist + cdef bint has_classes_boilerplate + cdef bint has_ids_boilerplate + cdef bint has_roles_boilerplate + cdef bint has_metas_whitelist + cdef unordered_set[int] tags_ignore cdef unordered_set[int] tags_ignore_head_only + cdef unordered_set[int] tags_boilerplate + cdef unordered_set[int] tags_boilerplate_bypass + cdef unordered_set[int] tags_separators + + cdef re2cy.RE2* attributes_whitelist + cdef re2cy.RE2* metas_whitelist + cdef re2cy.RE2* classes_ignore + cdef re2cy.RE2* ids_ignore + cdef re2cy.RE2* classes_hidden + cdef re2cy.RE2* ids_hidden + cdef re2cy.RE2* classes_boilerplate + cdef re2cy.RE2* ids_boilerplate + cdef re2cy.RE2* roles_boilerplate + + cdef bint analyze_internal_hyperlinks + cdef bint analyze_external_hyperlinks + cdef bint analyze_word_groups + + # Variables reinitialized at each parse() + cdef list current_stack + + cdef dict analysis + + cdef object current_word_group + cdef object current_hyperlink + + cdef bint has_output + cdef gumbocy.GumboOutput* output + cdef list nodes + + def __cinit__(self, dict options=None): + + options = options or {} + + self.nesting_limit = options.get("nesting_limit", 999) + self.head_only = options.get("head_only") + + self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True)) + self.analyze_internal_hyperlinks = bool(options.get("analyze_internal_hyperlinks", True)) + 
self.analyze_word_groups = bool(options.get("analyze_word_groups", True)) + + attributes_whitelist = set(options.get("attributes_whitelist") or []) + + classes_ignore = frozenset(options.get("classes_ignore") or []) + if len(classes_ignore) > 0: + self.has_classes_ignore = True + self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$") + attributes_whitelist.add("class") + + ids_ignore = frozenset(options.get("ids_ignore") or []) + if len(ids_ignore) > 0: + self.has_ids_ignore = True + self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$") + attributes_whitelist.add("id") + + classes_hidden = frozenset(options.get("classes_hidden") or []) + if len(classes_hidden) > 0: + self.has_classes_hidden = True + self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$") + attributes_whitelist.add("class") + + ids_hidden = frozenset(options.get("ids_hidden") or []) + if len(ids_hidden) > 0: + self.has_ids_hidden = True + self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$") + attributes_whitelist.add("id") + + classes_boilerplate = frozenset(options.get("classes_boilerplate") or []) + if len(classes_boilerplate) > 0: + self.has_classes_boilerplate = True + self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$") + attributes_whitelist.add("class") + + ids_boilerplate = frozenset(options.get("ids_boilerplate") or []) + if len(ids_boilerplate) > 0: + self.has_ids_boilerplate = True + self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$") + attributes_whitelist.add("id") + + roles_boilerplate = frozenset(options.get("roles_boilerplate") or []) + if len(roles_boilerplate) > 0: + self.has_roles_boilerplate = True + self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$") + attributes_whitelist.add("role") + + metas_whitelist = frozenset(options.get("metas_whitelist") or []) + if len(metas_whitelist) > 0: + 
self.has_metas_whitelist = True + self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$") + attributes_whitelist.add("name") + attributes_whitelist.add("property") + attributes_whitelist.add("content") + + # Some options add attributes to the whitelist + if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks: + attributes_whitelist.add("href") + attributes_whitelist.add("rel") + + # FInally, freeze the attributes whitelist + self.has_attributes_whitelist = len(attributes_whitelist) > 0 + if self.has_attributes_whitelist: + self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$") + + self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) + self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) + self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) + self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) + + for tag_name in options.get("tags_ignore", []): + tag = gumbocy.gumbo_tag_enum(tag_name) + if tag != gumbocy.GUMBO_TAG_UNKNOWN: + self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) + + for tag_name in options.get("tags_boilerplate", []): + tag = gumbocy.gumbo_tag_enum(tag_name) + if tag != gumbocy.GUMBO_TAG_UNKNOWN: + self.tags_boilerplate.insert( gumbocy.gumbo_tag_enum(tag_name)) + + for tag_name in options.get("tags_boilerplate_bypass", []): + tag = gumbocy.gumbo_tag_enum(tag_name) + if tag != gumbocy.GUMBO_TAG_UNKNOWN: + self.tags_boilerplate_bypass.insert( gumbocy.gumbo_tag_enum(tag_name)) + + for tag_name in options.get("tags_separators", []): + tag = gumbocy.gumbo_tag_enum(tag_name) + if tag != gumbocy.GUMBO_TAG_UNKNOWN: + self.tags_separators.insert( gumbocy.gumbo_tag_enum(tag_name)) + + self.tags_separators.insert(gumbocy.GUMBO_TAG_BODY) + + cdef bint guess_node_hidden(self, gumbocy.GumboNode* node, Attributes attrs): + """ Rough guess to check if the element is explicitly hidden. + + Not intended to combat spam! 
+ """ + + if not self.has_attributes_whitelist: + return False + + # From the HTML5 spec + if attrs.has_hidden: + return True + + if self.has_ids_hidden and attrs.values.get(ATTR_ID): + if re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)): + return True + + if self.has_classes_hidden and attrs.size_classes > 0: + for k in attrs.classes: + if re2_search(k, deref(self.classes_hidden)): + return True + + if attrs.values.get(ATTR_STYLE): + if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)): + return True + + return False + - cdef frozenset classes_ignore - cdef frozenset attributes_whitelist - cdef frozenset ids_ignore + cdef bint guess_node_boilerplate(self, gumbocy.GumboNode* node, Attributes attrs): + """ Rough guess to check if the element is boilerplate """ + if self.tags_boilerplate.count( node.v.element.tag): + return True - def __cinit__(self, char* html): - self.html = html + # http://html5doctor.com/understanding-aside/ + if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE: + if "article" not in self.current_stack: + return True - cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node): + if self.has_classes_boilerplate and attrs.size_classes > 0: + for k in attrs.classes: + if re2_search(k, deref(self.classes_boilerplate)): + return True + + if self.has_ids_boilerplate and attrs.values.get(ATTR_ID): + if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)): + return True + + if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE): + if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)): + return True + + return False + + cdef Attributes get_attributes(self, gumbocy.GumboNode* node): + """ Build a dict with all the whitelisted attributes """ + + attrs = Attributes() + # cdef Attributes attrs + attrs.size_classes = 0 + attrs.has_hidden = 0 + # attrs.values = [""] * 10 + # attrs.classes = [] + attrs.values = {} # deref(new map[AttributeNames, const char*]()) + # attrs.values[ATTR_ID] = "x" + 
# print dict(attrs.values) + + for i in range(node.v.element.attributes.length): + + attr = node.v.element.attributes.data[i] + + if re2_search(attr.name, deref(self.attributes_whitelist)): + + if attr.name == b"class": + multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) + attrs.size_classes = len(multiple_value) + if attrs.size_classes > 0: + attrs.classes = list(multiple_value) + # for k in multiple_value: + # ck = k + # attrs.classes.push_back(ck) # = list(multiple_value) + + elif attr.name == b"id": + pystr = str(attr.value).lower() + attrs.values[ATTR_ID] = pystr + + elif attr.name == b"style": + attrs.values[ATTR_STYLE] = attr.value + + elif attr.name == b"href": + attrs.values[ATTR_HREF] = attr.value + + elif attr.name == b"role": + pystr = str(attr.value).lower() + attrs.values[ATTR_ROLE] = pystr + + elif attr.name == b"rel": + pystr = str(attr.value).lower() + attrs.values[ATTR_REL] = pystr + + elif attr.name == b"aria-hidden" and attr.value == b"true": + attrs.has_hidden = 1 + + elif attr.name == b"hidden": + attrs.has_hidden = 1 + + elif attr.name == b"alt": + attrs.values[ATTR_ALT] = attr.value + + elif attr.name == b"src": + attrs.values[ATTR_SRC] = attr.value + + elif attr.name == b"name": + pystr = str(attr.value).lower() + attrs.values[ATTR_NAME] = pystr + + elif attr.name == b"property": + pystr = str(attr.value).lower() + attrs.values[ATTR_PROPERTY] = pystr + + elif attr.name == b"content": + attrs.values[ATTR_CONTENT] = attr.value + + return attrs + + cdef void close_word_group(self): + """ Close the current word group """ + + if self.current_word_group: + self.analysis["word_groups"].append(tuple(self.current_word_group)) + self.current_word_group = None + + + cdef void add_text(self, text): + + if not self.current_word_group: + self.current_word_group = [text.strip(), self.current_stack[-1]] + else: + self.current_word_group[0] += " " + text.strip() + + cdef void add_hyperlink_text(self, text): + if 
self.current_hyperlink: + self.current_hyperlink[1] += text + + cdef void open_hyperlink(self, Attributes attrs): + + if not attrs.values.get(ATTR_HREF): + return + + if len(attrs.values[ATTR_HREF]) == 0: + return + + if re2_search(attrs.values[ATTR_HREF], deref(_RE2_IGNORED_HREF)): + return + + self.close_hyperlink() + self.current_hyperlink = [attrs.values[ATTR_HREF], ""] + + cdef void close_hyperlink(self): + if self.current_hyperlink: + href = self.current_hyperlink[0] + + # TODO: absolute links to same domain + if re2_search(href, deref(_RE2_EXTERNAL_HREF)): + if self.analyze_external_hyperlinks: + if href.startswith("http://") or href.startswith("https://") or href.startswith("//"): + self.analysis["external_hyperlinks"].append(tuple(self.current_hyperlink)) + else: + if self.analyze_internal_hyperlinks: + self.analysis["internal_hyperlinks"].append(tuple(self.current_hyperlink)) + + self.current_hyperlink = None + + cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node, bint is_head, bint is_hidden, bint is_boilerplate, bint is_boilerplate_bypassed, bint is_hyperlink): + """ Traverses the node tree. 
Return 1 to stop at this level """ + + cdef GumboStringPiece gsp + cdef const char* tag_name + cdef int tag_n + + if level > self.nesting_limit: + return 0 + + if node.type == gumbocy.GUMBO_NODE_TEXT: + + if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink: + self.add_hyperlink_text(node.v.text.text) + + if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed): + self.add_text(node.v.text.text) + + elif node.type == gumbocy.GUMBO_NODE_ELEMENT: + + tag_n = node.v.element.tag + + if self.head_only and self.tags_ignore_head_only.count(tag_n): + return 1 + + if self.tags_ignore.count(tag_n): + return 0 + + tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag) + + # When we find an unknown tag, find its tag_name in the buffer + if tag_name == b"": + gsp = node.v.element.original_tag + gumbo_tag_from_original_text(&gsp) + py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C! 
+ tag_name = py_tag_name + + # if self.has_attributes_whitelist: + + attrs = self.get_attributes(node) + + if self.has_classes_ignore and attrs.size_classes > 0: + for v in attrs.classes: + if re2_search(v, deref(self.classes_ignore)): + return 0 + + if self.has_ids_ignore and attrs.values.get(ATTR_ID): + if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)): + return 0 + + if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE: + if not self.analysis.get("title"): + if node.v.element.children.length > 0: + first_child = node.v.element.children.data[0] + if first_child.type == gumbocy.GUMBO_NODE_TEXT: + self.analysis["title"] = first_child.v.text.text + return 0 + + self.current_stack.append(tag_name) + + if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + is_head = 1 + + elif node.v.element.tag == gumbocy.GUMBO_TAG_A: + self.open_hyperlink(attrs) + is_hyperlink = 1 + + elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG: + self.close_word_group() + if attrs.values.get(ATTR_ALT): + self.add_text(attrs.values[ATTR_ALT]) + self.close_word_group() + + # Text extraction from image filenames disabled for now + # if attrs.get("src"): + # if not attrs["src"].startswith("data:"): + # self.add_text(self._split_filename_words(attrs["src"])) + # self.close_word_group() + + + if is_head: + if node.v.element.tag == gumbocy.GUMBO_TAG_LINK: + + # TODO: more properties + if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF): + self.analysis.setdefault("head_links", []) + self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]}) + + elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META: + + if attrs.values.get(ATTR_CONTENT): + + if attrs.values.get(ATTR_NAME): + if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)): + self.analysis.setdefault("head_metas", {}) + self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip() + + elif attrs.values.get(ATTR_PROPERTY): 
+ if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)): + self.analysis.setdefault("head_metas", {}) + self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip() + + elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE: + if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis: + self.analysis["base_url"] = attrs.values[ATTR_HREF] + + # TODO is_article + + if not is_hidden: + is_hidden = self.guess_node_hidden(node, attrs) + + if is_boilerplate and not is_boilerplate_bypassed: + if self.tags_boilerplate_bypass.count(tag_n): + is_boilerplate_bypassed = True + + if not is_boilerplate: + is_boilerplate = self.guess_node_boilerplate(node, attrs) + + # print " " * level, "BOILER", tag_name, is_boilerplate, dict(attrs.values), attrs.classes + + # Close the word group + if self.tags_separators.count(tag_n): + self.close_word_group() + + # Call _traverse_node() recursively for each of the children + for i in range(node.v.element.children.length): + child = node.v.element.children.data[i] + if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1: + break + + # Close the word group + if self.tags_separators.count(tag_n): + self.close_word_group() + + self.current_stack.pop() + + if node.v.element.tag == gumbocy.GUMBO_TAG_A: + self.close_hyperlink() + + if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD: + if self.head_only: + return 1 + + return 0 + + def parse(self, char* html): + """ Do the actual parsing of the HTML with gumbo """ + + self.free() + self.output = gumbocy.gumbo_parse(html) + self.has_output = 1 + + def analyze(self): + """ Traverse the parsed tree and return the results """ + + self.analysis = {} + + if self.analyze_internal_hyperlinks: + self.analysis["internal_hyperlinks"] = [] + + if self.analyze_external_hyperlinks: + self.analysis["external_hyperlinks"] = [] + + if self.analyze_word_groups: + self.analysis["word_groups"] = [] 
+ + self.current_stack = [] + self.current_word_group = None + self.current_hyperlink = None + + self._traverse_node(0, self.output.root, 0, 0, 0, 0, 0) + + return self.analysis + + # + # Older listnodes() API support + # + + def listnodes(self): + """ Return the nodes as a flat list of tuples """ + + self.nodes = [] + + self._traverse_node_simple(0, self.output.root) + + return self.nodes + + cdef bint _traverse_node_simple(self, int level, gumbocy.GumboNode* node): """ Traverses the node tree. Return 1 to stop at this level """ cdef GumboStringPiece gsp @@ -70,13 +608,13 @@ cdef class HTMLParser: for i in range(node.v.element.attributes.length): attr = node.v.element.attributes.data[i] attr_name = str(attr.name) - if attr_name in self.attributes_whitelist: + if re2_search(attr_name, deref(self.attributes_whitelist)): if attr_name == b"class": multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower())) if len(multiple_value): if self.has_classes_ignore: for v in multiple_value: - if v in self.classes_ignore: + if re2_search(v, deref(self.classes_ignore)): return 0 if not has_attrs: @@ -97,7 +635,7 @@ cdef class HTMLParser: else: if self.has_ids_ignore: - if attrs.get("id") and attrs["id"].lower() in self.ids_ignore: + if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)): return 0 self.nodes.append((level, tag_name, attrs)) @@ -108,7 +646,7 @@ cdef class HTMLParser: # Call _iternode() recursively for each of the children for i in range(node.v.element.children.length): child = node.v.element.children.data[i] - if self._traverse_node(level + 1, child) == 1: + if self._traverse_node_simple(level + 1, child) == 1: break if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only: @@ -116,45 +654,11 @@ cdef class HTMLParser: return 0 - def parse(self): - """ Do the actual parsing of the HTML with gumbo """ - self.output = gumbocy.gumbo_parse(self.html) - - def listnodes(self, dict options=None): - """ Return the 
nodes as a flat list of tuples """ - - options = options or {} - self.nesting_limit = options.get("nesting_limit", 999) - self.head_only = options.get("head_only") - - self.has_classes_ignore = options.get("classes_ignore") - if self.has_classes_ignore: - self.classes_ignore = frozenset(options["classes_ignore"]) - - self.has_ids_ignore = options.get("ids_ignore") - if self.has_ids_ignore: - self.ids_ignore = frozenset(options["ids_ignore"]) - - self.has_attributes_whitelist = options.get("attributes_whitelist") - if self.has_attributes_whitelist: - self.attributes_whitelist = frozenset(options.get("attributes_whitelist") or []) - - self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY) - self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P) - self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV) - self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN) - - for tag_name in options.get("tags_ignore", []): - tag = gumbocy.gumbo_tag_enum(tag_name) - if tag != gumbocy.GUMBO_TAG_UNKNOWN: - self.tags_ignore.insert( gumbocy.gumbo_tag_enum(tag_name)) - - self.nodes = [] - - self._traverse_node(0, self.output.root) - - return self.nodes - def __dealloc__(self): """ Cleanup gumbo memory when the parser is deallocated by Python """ - gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + self.free() + + cdef free(self): + if self.has_output: + gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output) + self.has_output = 0 diff --git a/re2cy.pxd b/re2cy.pxd new file mode 100644 index 0000000..5f44d17 --- /dev/null +++ b/re2cy.pxd @@ -0,0 +1,30 @@ +from libcpp.string cimport string + +ctypedef Arg* ArgPtr + + +cdef extern from "re2/stringpiece.h" namespace "re2": + cdef cppclass StringPiece: + # Eliding some constructors on purpose. 
+ StringPiece(const char*) except + + StringPiece(const string&) except + + + const char* data() + int length() + + +cdef extern from "re2/re2.h" namespace "re2": + + cdef cppclass Arg "RE2::Arg": + Arg() + + cdef cppclass RE2: + RE2(const char*) except + + + @staticmethod + bint PartialMatchN( + const char *, + const RE2&, + const Arg* const args[], + int, + ) diff --git a/scripts/git-set-file-times b/scripts/git-set-file-times new file mode 100755 index 0000000..0fe1b2e --- /dev/null +++ b/scripts/git-set-file-times @@ -0,0 +1,51 @@ +#!/usr/bin/perl -w +use strict; + +# sets mtime and atime of files to the latest commit time in git +# +# This is useful for serving static content (managed by git) +# from a cluster of identically configured HTTP servers. HTTP +# clients and content delivery networks can get consistent +# Last-Modified headers no matter which HTTP server in the +# cluster they hit. This should improve caching behavior. +# +# This does not take into account merges, but if you're updating +# every machine in the cluster from the same commit (A) to the +# same commit (B), the mtimes will be _consistent_ across all +# machines if not necessarily accurate. +# +# THIS IS NOT INTENDED TO OPTIMIZE BUILD SYSTEMS SUCH AS 'make' +# YOU HAVE BEEN WARNED! + +my %ls = (); +my $commit_time; + +if ($ENV{GIT_DIR}) { + chdir($ENV{GIT_DIR}) or die $!; +} + +$/ = "\0"; +open FH, 'git ls-files -z|' or die $!; +while () { + chomp; + $ls{$_} = $_; +} +close FH; + + +$/ = "\n"; +open FH, "git log -m -r --name-only --no-color --pretty=raw -z @ARGV |" or die $!; +while () { + chomp; + if (/^committer .*? 
(\d+) (?:[\-\+]\d+)$/) { + $commit_time = $1; + } elsif (s/\0\0commit [a-f0-9]{40}( \(from [a-f0-9]{40}\))?$// or s/\0$//) { + my @files = delete @ls{split(/\0/, $_)}; + @files = grep { defined $_ } @files; + next unless @files; + utime $commit_time, $commit_time, @files; + } + last unless %ls; + +} +close FH; \ No newline at end of file diff --git a/setup.py b/setup.py index e7e8a9d..1d3bec6 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ ext_modules = [ Extension("gumbocy", ["gumbocy.cpp"], - libraries=["gumbo"], + libraries=["gumbo", "re2"], language="c++", extra_compile_args=["-std=c++11", '-O3', '-static-libstdc++'], extra_link_args=["-std=c++11"]) # , "-static" @@ -36,7 +36,7 @@ setup( name="gumbocy", - version="0.1", + version="0.2.0", description="Python binding for gumbo-parser (an HTML5-compliant parser) using Cython", author="Common Search contributors", license="Apache License, Version 2.0", diff --git a/tests/benchmark_parsers.py b/tests/benchmark_parsers.py index cfa401a..12deb76 100644 --- a/tests/benchmark_parsers.py +++ b/tests/benchmark_parsers.py @@ -12,9 +12,15 @@ import gumbo import bs4 -url = 'https://raw.githubusercontent.com/whatwg/html/d8717d8831c276ca65d2d44bbf2ce4ce673997b9/source' -html = requests.get(url).content -html_unicode = html.decode("utf-8") +if not os.path.isfile("tests/_benchmark_fixture.html"): + url = 'https://raw.githubusercontent.com/whatwg/html/d8717d8831c276ca65d2d44bbf2ce4ce673997b9/source' + html = requests.get(url).content + with open("tests/_benchmark_fixture.html", "w") as f: + f.write(html) + +with open("tests/_benchmark_fixture.html", "r") as f: + html = f.read() + html_unicode = html.decode("utf-8") def bench(name, func): @@ -22,11 +28,11 @@ def bench(name, func): def benchmark_gumbocy(): - parser = gumbocy.HTMLParser(html) - parser.parse() - nodes = parser.listnodes(options={ + parser = gumbocy.HTMLParser(options={ "attributes_whitelist": ["id", "class", "style"] }) + parser.parse(html) + nodes = 
parser.listnodes() divs_count = 0 for node in nodes: diff --git a/tests/test_analyze.py b/tests/test_analyze.py new file mode 100644 index 0000000..d8d2d22 --- /dev/null +++ b/tests/test_analyze.py @@ -0,0 +1,135 @@ +import gumbocy +from test_word_groups import TAGS_SEPARATORS + + +def analyze(html, options=None): + parser = gumbocy.HTMLParser(options=options) + parser.parse(html) + return parser.analyze() + + +def test_separators(): + html = """ +

text

+

text 2

+

pre

inner

+ """ + + analyzed = analyze(html, options={ + "tags_separators": ["p"] + }) + + assert analyzed["word_groups"] == [ + ("text", "p"), + ("text 2", "p"), + ("pre", "p"), + ("inner", "p") + ] + + # More word group tests in test_word_groups.py + + +def test_hidden_text(): + + html = """ + + text +
textp
+
hidden by display
+
ignored by class_noindex
+
ignored by class_noindex 2
+ + +
not_aria
+
hidden by visibility
+ """ + + analyzed = analyze(html, options={ + "attributes_whitelist": ["style", "hidden", "aria-hidden"], + "classes_hidden": ["_class_hidden"], + "ids_hidden": ["_id_hidden"], + "tags_separators": ["div"], + "classes_ignore": ["_class_noindex"] + }) + + assert analyzed["word_groups"] == [ + ("text", "body"), + ("textp", "div"), + ("not_aria", "div") + ] + + +def test_hidden_siblings(): + + html = """ + + +""" + + analyzed = analyze(html, options={ + "classes_boilerplate": ["login"] + }) + + assert analyzed["word_groups"] == [] + + +def test_boilerplate_text(): + + html = """ + +
+ Boilerplate +

Title

+
+ +
x
+
y
+
z
+ +

Title 2

+ """ + + analyzed = analyze(html, options={ + "attributes_whitelist": ["id", "class", "role"], + "tags_boilerplate": ["header"], + "tags_boilerplate_bypass": ["h2"], + "classes_boilerplate": ["classboil"], + "ids_boilerplate": ["idboil"], + "roles_boilerplate": ["roleboil"], + "tags_separators": TAGS_SEPARATORS + }) + + assert analyzed["word_groups"] == [ + ("Title", "h2"), + ("Title 2", "h2") + ] + + +def test_title(): + + html = """ test 1 test 2 """ + + analyzed = analyze(html, options={ + }) + + assert analyzed["title"] == "test 1" + assert len(analyzed["word_groups"]) == 0 + + +def test_head_metas(): + + html = """ + + + + + This is <body> text + """ + + analyzed = analyze(html, options={ + "metas_whitelist": ["description"] + }) + + assert analyzed["head_metas"] == {"description": "This is a !"} diff --git a/tests/test_hyperlinks.py b/tests/test_hyperlinks.py new file mode 100644 index 0000000..5fb1e63 --- /dev/null +++ b/tests/test_hyperlinks.py @@ -0,0 +1,64 @@ +import gumbocy +from test_word_groups import TAGS_SEPARATORS + + +def _links(html, url=None): + parser = gumbocy.HTMLParser(options={ + "tags_separators": TAGS_SEPARATORS + }) + parser.parse(html) + ret = parser.analyze() + return ret["internal_hyperlinks"] + ret["external_hyperlinks"] + + +def test_get_hyperlinks(): + links = _links("""Test titlex""") + assert len(links) == 0 + + links = _links("""Test title + Y + """) + assert len(links) == 0 + + links = _links("""Test title + Y + """) + assert len(links) == 0 + + links = _links("""Test title + Y + """) + assert len(links) == 0 + + links = _links("""Test title + Y + """) + assert len(links) == 0 + + links = _links("""Test title + Y + """) + assert len(links) == 0 + + links = _links("""Test title + Y + """) + assert len(links) == 1 + assert links[0][0] == "http://sub.test.com/page1?q=2&a=b#xxx" + assert links[0][1] == "Y" + + links = _links("""Test title + Y X + """, url="http://sub.test.com/page2") + assert len(links) == 1 + assert links[0][0] == 
"/page1?q=2&a=b#xxx" + assert links[0][1] == "Y X" + + links = _links("""Test title + Y Z + """, url="http://sub.test.com/page2/x.html") + assert len(links) == 1 + assert links[0][0] == "../page1?q=2&a=b#xxx" + assert links[0][1] == "Y Z" + + # TODO resolution tests \ No newline at end of file diff --git a/tests/test_listnodes.py b/tests/test_listnodes.py index 7ecc0f3..05d33f0 100644 --- a/tests/test_listnodes.py +++ b/tests/test_listnodes.py @@ -2,9 +2,9 @@ def listnodes(html, options=None): - parser = gumbocy.HTMLParser(html) - parser.parse() - return parser.listnodes(options=options) + parser = gumbocy.HTMLParser(options=options) + parser.parse(html) + return parser.listnodes() def test_basic(): diff --git a/tests/test_word_groups.py b/tests/test_word_groups.py new file mode 100644 index 0000000..ffd218b --- /dev/null +++ b/tests/test_word_groups.py @@ -0,0 +1,134 @@ +import gumbocy +import pytest + +TAGS_SEPARATORS = frozenset([ + "body", + + # http://www.w3.org/TR/html5/grouping-content.html#grouping-content + "p", "pre", "blockquote", "ul", "ol", "li", "dl", "dt", "dd", "figure", "figcaption", + + "br", "img", + + "h1", "h2", "h3", "h4", "h5", "h6" +]) + + +SAMPLES = [ + { + "html": """

hello

""", + "groups": [ + ("hello", "p") + ] + }, + + # A is automatically added + { + "html": """ nobody """, + "groups": [ + ("nobody", "body") + ] + }, + + # span + { + "html": """

pre link post

""", + "groups": [ + ("pre link post", "p") + ] + }, + + # a + { + "html": """

pre link post

""", + "groups": [ + ("pre link post", "p") + ] + }, + + # mid p + { + "html": """

pre

  • li1 x
mid

post

""", + "groups": [ + ("pre", "p"), + ("li1 x", "li"), + ("mid", "body"), + ("post", "p") + ] + }, + + # Lists + { + "html": """ pre
  • li1
  • li2
post """, + "groups": [ + ("pre", "body"), + ("li1", "li"), + ("li2", "li"), + ("post", "body") + ] + }, + + # HR with illegal

. "post" is actually part of . + { + "html": """

pre


post

""", + "groups": [ + ("pre", "p"), + ("post", "body") + ] + }, + + # Non-closed p tag. + { + "html": """ pre

post""", + "groups": [ + ("pre", "body"), + ("post", "p") + ] + }, + + # BR + { + "html": """

pre
post

""", + "groups": [ + ("pre", "p"), + ("post", "p") + ] + }, + + # IMG filename + alt + { + "html": """

pre james brown post

""", + "groups": [ + ("pre", "p"), + ("james brown", "img"), + # ("maceo parker", "img"), + ("post", "p") + ] + }, + + # IMG with dataURIs are ignored + { + "html": """

pre Red dot post

""", + "groups": [ + ("pre", "p"), + ("Red dot", "img"), + ("post", "p") + ] + }, +] + + +# TODO: good coverage of http://www.w3.org/html/wg/drafts/html/master/syntax.html +@pytest.mark.parametrize(("sample"), SAMPLES) +def test_get_word_groups(sample): + + parser = gumbocy.HTMLParser(options={ + "tags_separators": TAGS_SEPARATORS, + "attributes_whitelist": ["src", "alt"] + }) + parser.parse(sample["html"]) + parsed = parser.analyze() + + for i, group in enumerate(parsed["word_groups"]): + assert group == sample["groups"][i] + + assert len(parsed["word_groups"]) == len(sample["groups"])