Package `niteru`

niteru

This package provides a set of functions to measure the similarity between HTML documents.

Note: This is a fork of html-similarity.

Key differences

Type hints
All functions have proper type hints
Dependency free
Works with plain Python alone; no third-party packages are required

Installation

pip install niteru

How it works

Structural Similarity

Uses sequence comparison of the html tags to compute the similarity.

We do not implement the similarity based on tree edit distance because it is slower than sequence comparison.

Style Similarity

Extracts the CSS classes of each HTML document and calculates the Jaccard similarity of the two sets of classes.

Joint Similarity (Structural Similarity and Style Similarity)

The joint similarity metric is calculated as::

k * structural_similarity(html1, html2) + (1 - k) * style_similarity(html1, html2)

All the similarity metrics take values between 0.0 and 1.0.

Recommendations for joint similarity

Using k=0.3 gives better results, because the style similarity carries more information about how similar two documents are than the structural similarity does.

Examples

Here is an example:

html1 = '''
<h1 class="title">First Document</h1>
<ul class="menu">
  <li class="active">Documents</li>
  <li>Extra</li>
</ul>
 '''

html2 = '''
<h1 class="title">Second document Document</h1>
<ul class="menu">
  <li class="active">Extra Documents</li>
</ul>
'''

from niteru import style_similarity, structural_similarity, similarity

style_similarity(html1, html2) # => 1.0
structural_similarity(html1, html2) # => 0.8571428571428571
similarity(html1, html2) # => 0.9285714285714286

Expand source code

"""
.. include:: ../README.md
"""
from niteru.similarity import similarity
from niteru.structural_similarity import structural_similarity
from niteru.style_similarity import style_similarity

# Version string of the package.
__version__ = "0.1.0"

# Public API: the names exported by ``from niteru import *``.
__all__ = ["structural_similarity", "style_similarity", "similarity"]

Sub-modules

niteru.dataclasses
niteru.html_parser
niteru.utils

Functions

def similarity(html1: str, html2: str, k: float = 0.5, use_quick_ratio: bool = False) ‑> float

Computes the joint similarity between two DOM trees

Args

html1 : str: HTML string
html2 : str: HTML string
k : float, optional: Weight of structural_similarity function as a float in the range 0.0 to 1.0. Defaults to 0.5.
use_quick_ratio : bool, optional: Whether to use the difflib.SequenceMatcher.quick_ratio function for computing similarity in the structural_similarity function. The difflib.SequenceMatcher.ratio function is used by default. Defaults to False.

Returns

float: Similarity as a float in the range 0.0 to 1.0.

Expand source code

def similarity(
    html1: str, html2: str, k: float = 0.5, use_quick_ratio: bool = False
) -> float:
    """Computes the joint similarity between two DOM trees

    The result is the weighted sum
    ``k * structural_similarity(...) + (1 - k) * style_similarity(...)``.

    Args:
        html1 (str): HTML string
        html2 (str): HTML string
        k (float, optional): Weight of structural_similarity function as a float in the range 0.0 to 1.0. The style_similarity function is weighted by ``1 - k``. Defaults to 0.5.
        use_quick_ratio (bool, optional): Whether to use the difflib.SequenceMatcher.quick_ratio function for computing similarity in the structural_similarity function. The difflib.SequenceMatcher.ratio function is used by default. Defaults to False.

    Returns:
        float: Similarity as a float in the range 0.0 to 1.0 (provided ``k`` is within that range).
    """
    # Forward use_quick_ratio: it was previously accepted but never passed on,
    # so the flag silently had no effect on the structural component.
    structural = structural_similarity(
        html1, html2, use_quick_ratio=use_quick_ratio
    )
    style = style_similarity(html1, html2)
    return k * structural + (1 - k) * style

def structural_similarity(html1: str, html2: str, use_quick_ratio: bool = False) ‑> float

Computes the structural similarity between two DOM trees

Args

html1 : str: HTML string
html2 : str: HTML string
use_quick_ratio : bool, optional: Whether to use the difflib.SequenceMatcher.quick_ratio function for computing similarity. The difflib.SequenceMatcher.ratio function is used by default. Defaults to False.

Returns

float: Similarity as a float in the range 0.0 to 1.0.

Expand source code

def structural_similarity(
    html1: str, html2: str, use_quick_ratio: bool = False
) -> float:
    """Scores how alike two HTML documents are by comparing their tag sequences

    Args:
        html1 (str): HTML string
        html2 (str): HTML string
        use_quick_ratio (bool, optional): When True, the score comes from difflib.SequenceMatcher.quick_ratio; otherwise difflib.SequenceMatcher.ratio is used. Defaults to False.

    Returns:
        float: Similarity as a float in the range 0.0 to 1.0. 0.0 is returned when parsing raises or when either input is not recognised as HTML.
    """
    try:
        first, second = parse_html(html1), parse_html(html2)
    except Exception:
        # Any parsing failure counts as "no similarity".
        return 0.0

    # Both inputs have to pass the is_html check, otherwise the score is 0.0.
    if not (is_html(first) and is_html(second)):
        return 0.0

    tags_a, tags_b = first.tags, second.tags
    matcher: SequenceMatcher = SequenceMatcher()
    # set_seqs is set_seq1 followed by set_seq2.
    matcher.set_seqs(tags_a, tags_b)

    return matcher.quick_ratio() if use_quick_ratio else matcher.ratio()

def style_similarity(html1: str, html2: str) ‑> float

Computes CSS style similarity between two DOM trees

Args

html1 : str: HTML string
html2 : str: HTML string

Returns

float: Similarity as a float in the range 0.0 to 1.0.

Expand source code

def style_similarity(html1: str, html2: str) -> float:
    """Scores how alike two HTML documents are by comparing their CSS classes

    Args:
        html1 (str): HTML string
        html2 (str): HTML string

    Returns:
        float: Similarity as a float in the range 0.0 to 1.0. 0.0 is returned when parsing raises or when either input is not recognised as HTML.
    """
    try:
        first, second = parse_html(html1), parse_html(html2)
    except Exception:
        # Any parsing failure counts as "no similarity".
        return 0.0

    # Both inputs have to pass the is_html check, otherwise the score is 0.0.
    if not (is_html(first) and is_html(second)):
        return 0.0

    # Jaccard similarity over the classes collected from each document.
    return jaccard_similarity(first.classes, second.classes)