Module niteru.html_parser
Expand source code
from functools import lru_cache
from html.parser import HTMLParser
from typing import List, Optional, Tuple, cast
from niteru.dataclasses import ParsedHTML
class NiteruHTMLParser(HTMLParser):
    """HTML parser that collects tag names and ``class`` attribute tokens.

    Entries are stored in the order the handlers are invoked. Comments are
    recorded in ``tags`` as ``"comment"`` and declarations as
    ``"declaration"``.
    """

    def __init__(self):
        super().__init__()
        # Start-tag names plus the "comment" / "declaration" markers.
        self.tags: List[str] = []
        # Whitespace-separated tokens taken from every ``class`` attribute.
        self.classes: List[str] = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Record ``tag`` and the tokens of each valued ``class`` attribute.

        Args:
            tag: Name of the start tag.
            attrs: ``(name, value)`` pairs; ``value`` may be ``None``.
        """
        self.tags += [tag]
        for name, raw in attrs:
            # Skip everything except ``class`` attributes that carry a value.
            if name != "class" or raw is None:
                continue
            self.classes += raw.split()

    def handle_comment(self, _: str) -> None:
        """Record a comment as the ``"comment"`` marker; its text is ignored."""
        self.tags += ["comment"]

    def handle_decl(self, decl: str) -> None:
        """Record a declaration as the ``"declaration"`` marker; ``decl`` is ignored."""
        self.tags += ["declaration"]
@lru_cache(maxsize=128)
def _parse_html(html: str):
    """Parse ``html`` and return the collected tags and classes.

    Results are memoized by the ``html`` string (up to 128 entries).

    Args:
        html: The HTML source to parse.

    Returns:
        A ``ParsedHTML`` built from ``html`` and the parser's ``tags`` and
        ``classes`` lists.
    """
    parser = NiteruHTMLParser()
    parser.feed(html)
    # feed() may keep trailing, incomplete markup buffered; close() forces the
    # parser to process it as if end-of-file had been reached.
    parser.close()
    # NOTE(review): the cached result hands out the same list objects to every
    # caller passing the same html -- presumably callers treat them as
    # read-only; confirm ParsedHTML does not expose them for mutation.
    return ParsedHTML(html=html, tags=parser.tags, classes=parser.classes)
def parse_html(html: str) -> ParsedHTML:
    """Return the ``ParsedHTML`` for ``html``.

    Delegates to ``_parse_html`` and casts its result to ``ParsedHTML`` for
    type checkers.
    """
    return cast(ParsedHTML, _parse_html(html))
Functions
def parse_html(html: str) ‑> ParsedHTML
-
Expand source code
def parse_html(html: str) -> ParsedHTML:
    """Return the ``ParsedHTML`` for ``html``.

    Delegates to ``_parse_html`` and casts its result to ``ParsedHTML`` for
    type checkers.
    """
    return cast(ParsedHTML, _parse_html(html))
Classes
class NiteruHTMLParser
-
Find tags and other markup and call handler functions.
Usage
p = HTMLParser(); p.feed(data); …; p.close()
Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument.
Initialize and reset this instance.
If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters.
Expand source code
class NiteruHTMLParser(HTMLParser):
    """HTML parser that collects tag names and ``class`` attribute tokens.

    Entries are stored in the order the handlers are invoked. Comments are
    recorded in ``tags`` as ``"comment"`` and declarations as
    ``"declaration"``.
    """

    def __init__(self):
        super().__init__()
        # Start-tag names plus the "comment" / "declaration" markers.
        self.tags: List[str] = []
        # Whitespace-separated tokens taken from every ``class`` attribute.
        self.classes: List[str] = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Record ``tag`` and the tokens of each valued ``class`` attribute.

        Args:
            tag: Name of the start tag.
            attrs: ``(name, value)`` pairs; ``value`` may be ``None``.
        """
        self.tags += [tag]
        for name, raw in attrs:
            # Skip everything except ``class`` attributes that carry a value.
            if name != "class" or raw is None:
                continue
            self.classes += raw.split()

    def handle_comment(self, _: str) -> None:
        """Record a comment as the ``"comment"`` marker; its text is ignored."""
        self.tags += ["comment"]

    def handle_decl(self, decl: str) -> None:
        """Record a declaration as the ``"declaration"`` marker; ``decl`` is ignored."""
        self.tags += ["declaration"]
Ancestors
- html.parser.HTMLParser
- _markupbase.ParserBase
Methods
def handle_comment(self, _: str) ‑> NoneType
-
Expand source code
def handle_comment(self, _: str) -> None:
    """Record a comment as the ``"comment"`` marker; its text is ignored."""
    self.tags += ["comment"]
def handle_decl(self, decl: str) ‑> NoneType
-
Expand source code
def handle_decl(self, decl: str) -> None:
    """Record a declaration as the ``"declaration"`` marker; ``decl`` is ignored."""
    self.tags += ["declaration"]
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Union[str, NoneType]]]) ‑> NoneType
-
Expand source code
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Record ``tag`` and the tokens of each valued ``class`` attribute.

    Args:
        tag: Name of the start tag.
        attrs: ``(name, value)`` pairs; ``value`` may be ``None``.
    """
    self.tags += [tag]
    for name, raw in attrs:
        # Skip everything except ``class`` attributes that carry a value.
        if name != "class" or raw is None:
            continue
        self.classes += raw.split()