html

Updated March 13, 2026 · Modules

html web escaping parsing entities

The html module provides functions for escaping and unescaping HTML special characters, plus the HTMLParser class for parsing HTML and XHTML documents. It’s part of Python’s standard library, so no installation is required.

html.escape()

Converts the characters &, <, and > to their HTML-safe equivalents; with the default quote=True, the quote characters " and ' are escaped as well. This is essential when displaying user-generated content in HTML to prevent injection attacks.

Syntax

html.escape(s, quote=True)

Parameters

Parameter	Type	Default	Description
`s`	`str`	—	The string to escape
`quote`	`bool`	`True`	If `True`, also escapes `"` and `'` characters

Examples

Basic escaping

import html

text = "<script>alert('xss')</script>"
escaped = html.escape(text)
print(escaped)
# &lt;script&gt;alert(&#x27;xss&#x27;)&lt;/script&gt;

When rendering user input in HTML, escaping prevents malicious scripts from executing:

user_comment = "Great post! <a href='bad.com'>click here</a>"
print(html.escape(user_comment))
# Great post! &lt;a href=&#x27;bad.com&#x27;&gt;click here&lt;/a&gt;

Escaping without quotes

Use quote=False when the string will not be placed inside HTML attributes:

html.escape("<div>Hello</div>", quote=False)
# '&lt;div&gt;Hello&lt;/div&gt;'

html.unescape()

Converts HTML character references (both named entities like &gt; and numeric references like &#62; or &#x3e;) back to their Unicode characters.

Syntax

html.unescape(s)

Parameters

Parameter	Type	Default	Description
`s`	`str`	—	The string containing HTML character references to convert

Examples

Converting named entities

import html

text = "&lt;div&gt;&amp;&quot;test&quot;&quot;"
print(html.unescape(text))
# <div>&"test""

Converting numeric references

print(html.unescape("&#60;script&#62;"))        # decimal
# <script>

print(html.unescape("&#x3C;script&#x3E;"))     # hexadecimal
# <script>

This is useful when processing HTML content from APIs or databases that store escaped content:

api_response = "Product: &quot;Widget&quot; | Price: &amp;34.99"
print(html.unescape(api_response))
# Product: "Widget" | Price: &34.99

html.parser.HTMLParser

A versatile class for parsing HTML and XHTML documents. You subclass it and override handler methods to process different parts of the document.

Syntax

html.parser.HTMLParser(*, convert_charrefs=True, scripting=False)

Parameters

Parameter	Type	Default	Description
`convert_charrefs`	`bool`	`True`	If `True`, automatically converts character references to Unicode
`scripting`	`bool`	`False`	If `True`, returns noscript content unparsed

Methods

Method	Description
`feed(data)`	Feed HTML data to the parser
`close()`	Force processing of all buffered data
`reset()`	Reset the parser state
`getpos()`	Return current line number and offset
`get_starttag_text()`	Return text of the most recent start tag

Handler Methods to Override

Override these in your subclass:

Method	Called For
`handle_starttag(tag, attrs)`	Start tags like `<div id="main">`
`handle_endtag(tag)`	End tags like `</div>`
`handle_data(data)`	Text content
`handle_comment(data)`	HTML comments `<!-- -->`
`handle_startendtag(tag, attrs)`	Self-closing tags like `<img />`
`handle_entityref(name)`	Named entities like `&amp;` (only called when `convert_charrefs=False`)
`handle_charref(name)`	Numeric entities like `&#65;` or `&#x41;` (only called when `convert_charrefs=False`)

Examples

Extracting links from HTML

from html.parser import HTMLParser

class LinkExtractor(HTMLParser):
    """Collect the href value of every <a> start tag fed to the parser."""

    def __init__(self):
        super().__init__()
        # href values, in the order the anchors appear in the input.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record each href attribute of an anchor tag; ignore all other tags."""
        if tag != 'a':
            return
        # attrs is a list of (name, value) pairs; keep only the href values.
        self.links.extend(value for key, value in attrs if key == 'href')

parser = LinkExtractor()
parser.feed('''<html>
    <body>
        <a href="https://example.com">Example</a>
        <a href="https://python.org">Python</a>
    </body>
</html>''')

print(parser.links)
# ['https://example.com', 'https://python.org']

Extracting all text content

from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    """Gather the non-blank text nodes of a document, each stripped of surrounding whitespace."""

    def __init__(self):
        super().__init__()
        # One entry per non-empty text node, in document order.
        self.text_parts = []

    def handle_data(self, data):
        """Store the stripped text node unless it is empty after stripping."""
        fragment = data.strip()
        if not fragment:
            return
        self.text_parts.append(fragment)

parser = TextExtractor()
parser.feed('<h1>Welcome</h1><p>This is a <strong>test</strong>.</p>')
print(' '.join(parser.text_parts))
# Welcome This is a test .

Parsing with preserved entity references

By default, HTMLParser converts character references automatically. To handle them manually:

from html.parser import HTMLParser

class EntityParser(HTMLParser):
    """Print each character reference found in the input alongside its decoded character.

    Create the parser with ``convert_charrefs=False``; with the default
    (``True``) HTMLParser decodes references itself and these handlers are
    not called.
    """

    def handle_entityref(self, name):
        """Report a named reference; ``name`` is the text between ``&`` and ``;`` (e.g. ``'lt'``).

        Names missing from ``html.entities.name2codepoint`` are ignored.
        """
        import html.entities
        codepoint = html.entities.name2codepoint.get(name)
        if codepoint is not None:
            print(f"Named entity &{name}; -> {chr(codepoint)}")

    def handle_charref(self, name):
        """Report a numeric reference; ``name`` is e.g. ``'60'`` (decimal) or ``'x3C'`` (hex)."""
        # The parser accepts both "&#x3C;" and "&#X3C;", so the hex marker
        # must be matched in either case; otherwise int('X3C') raises ValueError.
        if name.startswith(('x', 'X')):
            char = chr(int(name[1:], 16))
        else:
            char = chr(int(name))
        print(f"Char ref &#{name}; -> {char}")

parser = EntityParser(convert_charrefs=False)
parser.feed("&lt;html&gt; and &#60;also&#62; and &#x3C;x&#x3E;")
# Named entity &lt; -> <
# Named entity &gt; -> >
# Char ref &#60; -> <
# Char ref &#62; -> >
# Char ref &#x3C; -> <
# Char ref &#x3E; -> >

Common Patterns

Preventing XSS in web applications

Always escape user input before rendering:

import html

def safe_render(user_input):
    """Return ``user_input`` with HTML special characters escaped via ``html.escape``."""
    escaped = html.escape(user_input)
    return escaped

# Before: could execute as JavaScript
# After: rendered as harmless text
print(safe_render("<img src=x onerror=alert(1)>"))
# &lt;img src=x onerror=alert(1)&gt;

Processing HTML from external sources

import html
from html.parser import HTMLParser

# First unescape any pre-encoded entities
raw_html = "&lt;div&gt;Hello&lt;/div&gt;"
cleaned = html.unescape(raw_html)

# Then parse
class SimpleParser(HTMLParser):
    """Print every text node the parser encounters."""

    def handle_data(self, data):
        """Print the text node prefixed with 'Found text: '."""
        message = f"Found text: {data}"
        print(message)

SimpleParser().feed(cleaned)
# Found text: Hello

Extracting metadata from HTML documents

from html.parser import HTMLParser

class MetaExtractor(HTMLParser):
    """Extract the document title and <meta> name/property -> content pairs.

    After feeding a document:
      * ``title`` holds the stripped text of the first <title> element, or
        ``None`` if no complete <title>...</title> was seen.
      * ``meta`` maps each meta tag's ``name`` (or, failing that, its
        ``property``) to its ``content``.
    """

    def __init__(self):
        super().__init__()
        self.title = None
        self.meta = {}
        # True only while the parser is between <title> and </title>.
        self._in_title = False
        # Title text may arrive in several handle_data calls; collect them.
        self._title_parts = []

    def handle_starttag(self, tag, attrs):
        """Start title capture on <title>; record name/content on <meta>."""
        if tag == 'title':
            # Only the first <title> counts; later ones are ignored.
            if self.title is None:
                self._in_title = True
                self._title_parts = []
        elif tag == 'meta':
            attrs_dict = dict(attrs)
            name = attrs_dict.get('name') or attrs_dict.get('property', '')
            content = attrs_dict.get('content', '')
            # Skip meta tags without a usable key or without content.
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        """Finish title capture when </title> closes the element being read."""
        if tag == 'title' and self._in_title:
            self._in_title = False
            self.title = ''.join(self._title_parts).strip()

    def handle_data(self, data):
        """Collect text only while inside <title>.

        Taking the first text node of the document instead would pick up the
        whitespace that precedes <html> and leave the title empty.
        """
        if self._in_title:
            self._title_parts.append(data)

html_doc = '''
<html>
<head>
    <title>My Page</title>
    <meta name="description" content="A tutorial">
    <meta property="og:title" content="Social Title">
</head>
</html>
'''

extractor = MetaExtractor()
extractor.feed(html_doc)
print(f"Title: {extractor.title}")
print(f"Meta: {extractor.meta}")
# Title: My Page
# Meta: {'description': 'A tutorial', 'og:title': 'Social Title'}

html.escape()

Syntax

Parameters

Examples

Basic escaping

Escaping without quotes

html.unescape()

Syntax

Parameters

Examples

Converting named entities

Converting numeric references

html.parser.HTMLParser

Syntax

Parameters

Methods

Handler Methods to Override

Examples

Extracting links from HTML

Extracting all text content

Parsing with preserved entity references

Common Patterns

Preventing XSS in web applications

Processing HTML from external sources

Extracting metadata from HTML documents

See Also