html
The html package provides functions for escaping and unescaping HTML special characters (html.escape() and html.unescape()), and its html.parser submodule provides the HTMLParser class for parsing HTML and XHTML documents. It’s part of Python’s standard library, so no installation is required.
html.escape()
Converts characters &, <, and > to their HTML-safe equivalents. This is essential when displaying user-generated content in HTML to prevent injection attacks.
Syntax
html.escape(s, quote=True)
Parameters
| Parameter | Type | Default | Description |
|---|---|---|---|
| s | str | — | The string to escape |
| quote | bool | True | If True, also escapes " and ' characters |
Examples
Basic escaping
import html
text = "<script>alert('xss')</script>"
escaped = html.escape(text)
print(escaped)
# &lt;script&gt;alert(&#x27;xss&#x27;)&lt;/script&gt;
When rendering user input in HTML, escaping prevents malicious scripts from executing:
user_comment = "Great post! <a href='bad.com'>click here</a>"
print(html.escape(user_comment))
# Great post! &lt;a href=&#x27;bad.com&#x27;&gt;click here&lt;/a&gt;
Escaping without quotes
Use quote=False when the string will not be placed inside HTML attributes:
html.escape("<div>Hello</div>", quote=False)
# '&lt;div&gt;Hello&lt;/div&gt;'
html.unescape()
Converts HTML character references (both named entities like &gt; and numeric references like &#62; or &#x3e;) back to their Unicode characters.
Syntax
html.unescape(s)
Parameters
| Parameter | Type | Default | Description |
|---|---|---|---|
| s | str | — | The string containing HTML character references to convert |
Examples
Converting named entities
import html
# The input holds named character references; unescape() turns each one back
# into the character it stands for (&lt; -> <, &gt; -> >, &amp; -> &, &quot; -> ").
text = "&lt;div&gt;&amp;&quot;test&quot;&quot;"
print(html.unescape(text))
# <div>&"test""
Converting numeric references
# &#60; / &#62; are the decimal references for "<" and ">";
# &#x3c; / &#x3e; are the same characters written in hexadecimal.
print(html.unescape("&#60;script&#62;"))  # decimal
# <script>
print(html.unescape("&#x3c;script&#x3e;"))  # hexadecimal
# <script>
This is useful when processing HTML content from APIs or databases that store escaped content:
# Stored/escaped form as it might arrive from an API: quotes are &quot; and
# the ampersand is &amp;. unescape() restores the plain characters.
api_response = "Product: &quot;Widget&quot; | Price: &amp;34.99"
print(html.unescape(api_response))
# Product: "Widget" | Price: &34.99
html.parser.HTMLParser
A versatile class for parsing HTML and XHTML documents. You subclass it and override handler methods to process different parts of the document.
Syntax
html.parser.HTMLParser(*, convert_charrefs=True, scripting=False)
Parameters
| Parameter | Type | Default | Description |
|---|---|---|---|
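| convert_charrefs | bool | True | If True, automatically converts character references to Unicode |
| scripting | bool | False | If True, the content of `<noscript>` elements is returned unparsed (this parameter exists only in recent Python releases; check the documentation for your version) |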
Methods
| Method | Description |
|---|---|
| feed(data) | Feed HTML data to the parser |
| close() | Force processing of all buffered data |
| reset() | Reset the parser state |
| getpos() | Return current line number and offset |
| get_starttag_text() | Return text of the most recent start tag |
Handler Methods to Override
Override these in your subclass:
| Method | Called For |
|---|---|
| handle_starttag(tag, attrs) | Start tags like `<div id="main">` |
| handle_endtag(tag) | End tags like `</div>` |
| handle_data(data) | Text content |
| handle_comment(data) | HTML comments `<!-- -->` |
| handle_startendtag(tag, attrs) | Self-closing tags like `<img />` |
| handle_entityref(name) | Named entities like `&amp;` (only called when convert_charrefs=False) |
| handle_charref(name) | Numeric entities like `&#65;` (only called when convert_charrefs=False) |
Examples
Extracting links from HTML
from html.parser import HTMLParser
class LinkExtractor(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` start tag fed to the parser.

    After ``feed()`` returns, ``links`` holds the targets in document order.
    """

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of each anchor start tag.

        ``attrs`` is a list of ``(name, value)`` pairs. An attribute written
        without a value (``<a href>``) is reported with ``value`` set to
        ``None``; such entries are skipped so ``links`` only holds strings.
        """
        if tag != 'a':
            return
        self.links.extend(
            value for name, value in attrs
            if name == 'href' and value is not None
        )


parser = LinkExtractor()
parser.feed('''<html>
<body>
<a href="https://example.com">Example</a>
<a href="https://python.org">Python</a>
</body>
</html>''')
print(parser.links)
# ['https://example.com', 'https://python.org']
Extracting all text content
from html.parser import HTMLParser
class TextExtractor(HTMLParser):
    """Gather the non-blank text runs of a document, whitespace-trimmed.

    Each run of text between tags becomes one entry in ``text_parts``.
    """

    def __init__(self):
        super().__init__()
        self.text_parts = []

    def handle_data(self, data):
        """Store ``data`` with surrounding whitespace removed, unless empty."""
        cleaned = data.strip()
        if cleaned:
            self.text_parts.append(cleaned)


parser = TextExtractor()
parser.feed('<h1>Welcome</h1><p>This is a <strong>test</strong>.</p>')
print(' '.join(parser.text_parts))
# Welcome This is a test .
# (the "." after </strong> is its own text run, so joining the stripped runs
# with a space separates it from "test")
Parsing with preserved entity references
By default, HTMLParser converts character references automatically. To handle them manually:
from html.parser import HTMLParser
class EntityParser(HTMLParser):
    """Report each character reference instead of letting the parser decode it.

    Create the parser with ``convert_charrefs=False``; otherwise references
    are converted automatically and these handlers are not used.
    """

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&lt;``.

        ``name`` is the entity name without the ``&`` and ``;``. Names that
        are not in ``html.entities.name2codepoint`` are ignored.
        """
        import html.entities
        codepoint = html.entities.name2codepoint.get(name)
        if codepoint is not None:
            print(f"Named entity &{name}; -> {chr(codepoint)}")

    def handle_charref(self, name):
        """Print the character for a numeric reference.

        ``name`` is the text between ``&#`` and ``;``: decimal digits, or
        ``x``/``X`` followed by hexadecimal digits.
        """
        # Accept both &#x3c; and &#X3C;, since the hex marker may be upper case.
        if name.startswith(('x', 'X')):
            char = chr(int(name[1:], 16))
        else:
            char = chr(int(name))
        print(f"Char ref &#{name}; -> {char}")


parser = EntityParser(convert_charrefs=False)
parser.feed("&lt;html&gt; and &#60;also&#62; and &#x3c;x&#x3e;")
# Named entity &lt; -> <
# Named entity &gt; -> >
# Char ref &#60; -> <
# Char ref &#62; -> >
# Char ref &#x3c; -> <
# Char ref &#x3e; -> >
Common Patterns
Preventing XSS in web applications
Always escape user input before rendering:
import html
def safe_render(user_input):
    """Return ``user_input`` with HTML special characters escaped.

    Uses ``html.escape`` with its default ``quote=True``, so ``&``, ``<``,
    ``>``, ``"`` and ``'`` are all replaced by character references.
    """
    return html.escape(user_input)


# Before: could execute as JavaScript
# After: rendered as harmless text
print(safe_render("<img src=x onerror=alert(1)>"))
# &lt;img src=x onerror=alert(1)&gt;
Processing HTML from external sources
import html
from html.parser import HTMLParser
# First unescape any pre-encoded entities
raw_html = "&lt;div&gt;Hello&lt;/div&gt;"
cleaned = html.unescape(raw_html)  # now real markup: "<div>Hello</div>"


# Then parse
class SimpleParser(HTMLParser):
    """Print every run of text found between tags."""

    def handle_data(self, data):
        """Report one text run; tags themselves never reach this handler."""
        print(f"Found text: {data}")


SimpleParser().feed(cleaned)
# Found text: Hello
Extracting metadata from HTML documents
from html.parser import HTMLParser
class MetaExtractor(HTMLParser):
    """Extract the document title and ``<meta>`` name/content pairs.

    After ``feed()``, ``title`` holds the text of the first ``<title>``
    element (or ``None`` if there was none) and ``meta`` maps each meta tag's
    ``name`` (or, failing that, ``property``) to its ``content``.
    """

    def __init__(self):
        super().__init__()
        self.title = None
        self.meta = {}
        # Only text seen between <title> and </title> belongs to the title;
        # without this flag the first text run of the document (usually the
        # whitespace before <html>) would be taken as the title.
        self._in_title = False
        self._title_parts = []

    def handle_starttag(self, tag, attrs):
        """Start collecting title text, or record a ``<meta>`` entry."""
        if tag == 'title':
            self._in_title = True
        elif tag == 'meta':
            attrs_dict = dict(attrs)
            name = attrs_dict.get('name') or attrs_dict.get('property', '')
            content = attrs_dict.get('content', '')
            # Skip meta tags lacking either a key or a value.
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        """Finish the title once its closing tag is reached."""
        if tag == 'title' and self._in_title:
            self._in_title = False
            # Keep the first title only; the text may arrive in several pieces.
            if self.title is None:
                self.title = ''.join(self._title_parts).strip()

    def handle_data(self, data):
        """Accumulate text, but only while inside ``<title>``."""
        if self._in_title:
            self._title_parts.append(data)


html_doc = '''
<html>
<head>
<title>My Page</title>
<meta name="description" content="A tutorial">
<meta property="og:title" content="Social Title">
</head>
</html>
'''
extractor = MetaExtractor()
extractor.feed(html_doc)
print(f"Title: {extractor.title}")
print(f"Meta: {extractor.meta}")
# Title: My Page
# Meta: {'description': 'A tutorial', 'og:title': 'Social Title'}