html

Updated March 13, 2026 · Modules
html web escaping parsing entities

The html module provides functions for escaping and unescaping HTML special characters, plus the HTMLParser class for parsing HTML and XHTML documents. It’s part of Python’s standard library, so no installation is required.

html.escape()

Converts the characters &, <, and > to their HTML-safe equivalents; with quote=True (the default) it also converts " and '. This is essential when displaying user-generated content in HTML to prevent injection attacks.

Syntax

html.escape(s, quote=True)

Parameters

| Parameter | Type | Default | Description |
|---|---|---|---|
| s | str | — | The string to escape |
| quote | bool | True | If True, also escapes " and ' characters |

Examples

Basic escaping

import html

text = "<script>alert('xss')</script>"
escaped = html.escape(text)
print(escaped)
# &lt;script&gt;alert(&#x27;xss&#x27;)&lt;/script&gt;

When rendering user input in HTML, escaping prevents malicious scripts from executing:

user_comment = "Great post! <a href='bad.com'>click here</a>"
print(html.escape(user_comment))
# Great post! &lt;a href=&#x27;bad.com&#x27;&gt;click here&lt;/a&gt;

Escaping without quotes

Use quote=False when the string will not be placed inside HTML attributes:

html.escape("<div>Hello</div>", quote=False)
# '&lt;div&gt;Hello&lt;/div&gt;'

html.unescape()

Converts HTML character references (both named entities like &gt; and numeric references like &#62; or &#x3E;) back to their Unicode characters.

Syntax

html.unescape(s)

Parameters

| Parameter | Type | Default | Description |
|---|---|---|---|
| s | str | — | The string containing HTML character references to convert |

Examples

Converting named entities

import html

text = "&lt;div&gt;&amp;&quot;test&quot;&quot;"
print(html.unescape(text))
# <div>&"test""

Converting numeric references

print(html.unescape("&#60;script&#62;"))        # decimal
# <script>

print(html.unescape("&#x3C;script&#x3E;"))     # hexadecimal
# <script>

This is useful when processing HTML content from APIs or databases that store escaped content:

api_response = "Product: &quot;Widget&quot; | Price: &amp;34.99"
print(html.unescape(api_response))
# Product: "Widget" | Price: &34.99

html.parser.HTMLParser

A versatile class for parsing HTML and XHTML documents. You subclass it and override handler methods to process different parts of the document.

Syntax

html.parser.HTMLParser(*, convert_charrefs=True, scripting=False)

Parameters

| Parameter | Type | Default | Description |
|---|---|---|---|
| convert_charrefs | bool | True | If True, automatically converts character references to Unicode |
| scripting | bool | False | If True, returns noscript content unparsed |

Methods

| Method | Description |
|---|---|
| feed(data) | Feed HTML data to the parser |
| close() | Force processing of all buffered data |
| reset() | Reset the parser state |
| getpos() | Return current line number and offset |
| get_starttag_text() | Return text of the most recent start tag |

Handler Methods to Override

Override these in your subclass:

| Method | Called For |
|---|---|
| handle_starttag(tag, attrs) | Start tags like <div id="main"> |
| handle_endtag(tag) | End tags like </div> |
| handle_data(data) | Text content |
| handle_comment(data) | HTML comments <!-- --> |
| handle_startendtag(tag, attrs) | Self-closing tags like <img /> |
| handle_entityref(name) | Named entities like &amp; (only when convert_charrefs=False) |
| handle_charref(name) | Numeric entities like &#65; (only when convert_charrefs=False) |

Examples

from html.parser import HTMLParser

class LinkExtractor(HTMLParser):
    """Collect the href value of every <a> start tag fed to the parser."""

    def __init__(self):
        super().__init__()
        # href values, appended as anchor tags are encountered.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record every href attribute carried by an anchor start tag."""
        if tag != 'a':
            return
        self.links.extend(value for key, value in attrs if key == 'href')

parser = LinkExtractor()
parser.feed('''<html>
    <body>
        <a href="https://example.com">Example</a>
        <a href="https://python.org">Python</a>
    </body>
</html>''')

print(parser.links)
# ['https://example.com', 'https://python.org']

Extracting all text content

from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    """Gather the non-blank text nodes of a document, each stripped of
    surrounding whitespace."""

    def __init__(self):
        super().__init__()
        # Stripped text fragments, in the order the parser reports them.
        self.text_parts = []

    def handle_data(self, data):
        """Store the stripped text node unless it is empty after stripping."""
        stripped = data.strip()
        if not stripped:
            return
        self.text_parts.append(stripped)

parser = TextExtractor()
parser.feed('<h1>Welcome</h1><p>This is a <strong>test</strong>.</p>')
print(' '.join(parser.text_parts))
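# Welcome This is a test .
# (each text node is stripped and then joined with a space, so the "." that
#  follows </strong> ends up separated from "test")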

Parsing with preserved entity references

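By default, HTMLParser converts character references automatically and never calls handle_entityref() or handle_charref(). To handle them manually, create the parser with convert_charrefs=False and override those two methods: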

from html.parser import HTMLParser

class EntityParser(HTMLParser):
    """Print the character that each named or numeric reference stands for.

    The handlers below are only invoked when the parser is created with
    ``convert_charrefs=False``.
    """

    def handle_entityref(self, name):
        """Report a named reference such as ``&lt;``.

        ``name`` is the entity name without the leading ``&`` and trailing
        ``;``. Names missing from ``html.entities.name2codepoint`` are
        silently ignored.
        """
        import html.entities
        codepoint = html.entities.name2codepoint.get(name)
        if codepoint is not None:
            print(f"Named entity &{name}; -> {chr(codepoint)}")

    def handle_charref(self, name):
        """Report a numeric reference such as ``&#60;`` or ``&#x3C;``.

        ``name`` is the text between ``&#`` and ``;``: decimal digits, or
        ``x``/``X`` followed by hexadecimal digits.
        """
        # The hexadecimal marker may be written in either case (&#x3C; or
        # &#X3C;); accepting only lowercase would send "X3C" to int() and
        # raise ValueError.
        if name.startswith(('x', 'X')):
            char = chr(int(name[1:], 16))
        else:
            char = chr(int(name))
        print(f"Char ref &#{name}; -> {char}")

parser = EntityParser(convert_charrefs=False)
parser.feed("&lt;html&gt; and &#60;also&#62; and &#x3C;x&#x3E;")
# Named entity &lt; -> <
# Named entity &gt; -> >
# Char ref &#60; -> <
# Char ref &#62; -> >
# Char ref &#x3C; -> <
# Char ref &#x3E; -> >

Common Patterns

Preventing XSS in web applications

Always escape user input before rendering:

import html

def safe_render(user_input):
    """Return ``user_input`` with &, <, >, " and ' replaced by HTML-safe
    sequences so it is displayed as text rather than interpreted as markup."""
    escaped = html.escape(user_input, quote=True)
    return escaped

# Before: could execute as JavaScript
# After: rendered as harmless text
print(safe_render("<img src=x onerror=alert(1)>"))
# &lt;img src=x onerror=alert(1)&gt;

Processing HTML from external sources

import html
from html.parser import HTMLParser

# First unescape any pre-encoded entities
raw_html = "&lt;div&gt;Hello&lt;/div&gt;"
cleaned = html.unescape(raw_html)

# Then parse
class SimpleParser(HTMLParser):
    """Print every text node the parser reports."""

    def handle_data(self, data):
        """Print the text node prefixed with a fixed label."""
        message = f"Found text: {data}"
        print(message)

SimpleParser().feed(cleaned)
# Found text: Hello
# (after unescaping, <div> and </div> are real tags, so only the text
#  between them reaches handle_data)

Extracting metadata from HTML documents

from html.parser import HTMLParser

class MetaExtractor(HTMLParser):
    """Extract the document title and <meta> name/property -> content pairs.

    Attributes:
        title: Stripped text of the first non-empty <title> element, or
            None if no title text has been seen.
        meta: Dict mapping each <meta> tag's ``name`` (or, failing that,
            ``property``) to its ``content``; tags lacking either are skipped.
    """

    def __init__(self):
        super().__init__()
        self.title = None
        self.meta = {}
        # True only while the parser is between <title> and </title>.
        self._in_title = False
        # Title text may arrive in several handle_data() calls; collect them.
        self._title_parts = []

    def handle_starttag(self, tag, attrs):
        """Start title capture on <title>; record name/content on <meta>."""
        if tag == 'title':
            # Only capture while no title has been stored yet, so a later
            # <title> (e.g. inside inline SVG) cannot overwrite it.
            self._in_title = self.title is None
        elif tag == 'meta':
            attrs_dict = dict(attrs)
            name = attrs_dict.get('name') or attrs_dict.get('property', '')
            content = attrs_dict.get('content', '')
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        """Stop capturing title text once </title> is reached."""
        if tag == 'title':
            self._in_title = False

    def handle_data(self, data):
        """Accumulate text that appears inside the <title> element.

        Text outside <title> is ignored; otherwise the whitespace that
        precedes <html> would be taken for the title.
        """
        if self._in_title:
            self._title_parts.append(data)
            self.title = ''.join(self._title_parts).strip()

html_doc = '''
<html>
<head>
    <title>My Page</title>
    <meta name="description" content="A tutorial">
    <meta property="og:title" content="Social Title">
</head>
</html>
'''

extractor = MetaExtractor()
extractor.feed(html_doc)
print(f"Title: {extractor.title}")
print(f"Meta: {extractor.meta}")
# Title: My Page
# Meta: {'description': 'A tutorial', 'og:title': 'Social Title'}

See Also