Module `niteru.html_parser`

Expand source code

from functools import lru_cache
from html.parser import HTMLParser
from typing import List, Optional, Tuple, cast

from niteru.dataclasses import ParsedHTML


class NiteruHTMLParser(HTMLParser):
    """HTML parser that records the markup it encounters, in document order.

    After feeding a document, ``tags`` holds one entry per start tag,
    comment and declaration seen, and ``classes`` holds every
    whitespace-separated token found in ``class`` attributes.
    """

    def __init__(self):
        super().__init__()

        # Start-tag names plus the "comment" / "declaration" markers.
        self.tags: List[str] = []
        # Individual class names; duplicates are kept.
        self.classes: List[str] = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Record the start tag *tag* and the class names carried by *attrs*."""
        self.tags += [tag]

        for name, raw in attrs:
            # Only ``class`` attributes that actually carry a value contribute.
            if name != "class" or raw is None:
                continue
            self.classes += raw.split()

    def handle_comment(self, _: str) -> None:
        """Record a comment as the marker ``"comment"``; its text is ignored."""
        self.tags += ["comment"]

    def handle_decl(self, decl: str) -> None:
        """Record a declaration as the marker ``"declaration"``; *decl* is ignored."""
        self.tags += ["declaration"]


@lru_cache(maxsize=128)
def _parse_html(html: str):
    """Parse *html* and bundle the collected tags and classes into a ParsedHTML.

    Results are memoised for up to 128 distinct input strings, so a repeated
    call with an equal string returns the cached object instead of
    re-parsing.
    """
    collector = NiteruHTMLParser()
    # NOTE(review): the parser is fed but never close()d; confirm that
    # ignoring any markup still buffered at end of input is intended.
    collector.feed(html)

    found_tags, found_classes = collector.tags, collector.classes
    return ParsedHTML(html=html, tags=found_tags, classes=found_classes)


def parse_html(html: str) -> ParsedHTML:
    """Return the parsed representation of *html*.

    Delegates to the cached ``_parse_html``; the ``cast`` only informs type
    checkers and does nothing at runtime.
    """
    return cast(ParsedHTML, _parse_html(html))

Functions

def parse_html(html: str) ‑> ParsedHTML

Expand source code

def parse_html(html: str) -> ParsedHTML:
    """Return the parsed representation of *html*.

    Delegates to the cached ``_parse_html``; the ``cast`` only informs type
    checkers and does nothing at runtime.
    """
    return cast(ParsedHTML, _parse_html(html))

Classes

class NiteruHTMLParser

Find tags and other markup and call handler functions.

Usage

p = HTMLParser(); p.feed(data); …; p.close()

Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument.

Initialize and reset this instance.

If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters.

Expand source code

class NiteruHTMLParser(HTMLParser):
    """HTML parser that records the markup it encounters, in document order.

    After feeding a document, ``tags`` holds one entry per start tag,
    comment and declaration seen, and ``classes`` holds every
    whitespace-separated token found in ``class`` attributes.
    """

    def __init__(self):
        super().__init__()

        # Start-tag names plus the "comment" / "declaration" markers.
        self.tags: List[str] = []
        # Individual class names; duplicates are kept.
        self.classes: List[str] = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Record the start tag *tag* and the class names carried by *attrs*."""
        self.tags += [tag]

        for name, raw in attrs:
            # Only ``class`` attributes that actually carry a value contribute.
            if name != "class" or raw is None:
                continue
            self.classes += raw.split()

    def handle_comment(self, _: str) -> None:
        """Record a comment as the marker ``"comment"``; its text is ignored."""
        self.tags += ["comment"]

    def handle_decl(self, decl: str) -> None:
        """Record a declaration as the marker ``"declaration"``; *decl* is ignored."""
        self.tags += ["declaration"]

Ancestors

html.parser.HTMLParser
_markupbase.ParserBase

Methods

def handle_comment(self, _: str) ‑> NoneType

Expand source code

def handle_comment(self, _: str) -> None:
    """Record a comment as the marker ``"comment"``; its text is ignored."""
    self.tags += ["comment"]

def handle_decl(self, decl: str) ‑> NoneType

Expand source code

def handle_decl(self, decl: str) -> None:
    """Record a declaration as the marker ``"declaration"``; *decl* is ignored."""
    self.tags += ["declaration"]

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Union[str, NoneType]]]) ‑> NoneType

Expand source code

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Record the start tag *tag* and the class names carried by *attrs*."""
    self.tags += [tag]

    for name, raw in attrs:
        # Only ``class`` attributes that actually carry a value contribute.
        if name != "class" or raw is None:
            continue
        self.classes += raw.split()