Creating and Extracting Archives with tarfile
The tarfile module is Python’s built-in tool for working with tar archives—those classic Unix file packaging formats you see everywhere. Whether you’re bundling logs for rotation, creating backups, or distributing software packages, tarfile gives you full control over tar archives directly in Python.
Why Use tarfile?
Tar archives bundle multiple files and directories into a single stream, often compressed. They’re the backbone of software distribution, backups, and system administration. Python’s tarfile module lets you:
- Create new tar archives from files on disk
- Extract existing archives to directories
- Read archive contents without extracting
- Append files to existing archives
- Work with both plain tar and compressed variants (.tar.gz, .tar.bz2, .tar.xz)
Reading an Archive
Let’s start by reading an existing archive to see what’s inside:
import tarfile
# Open the archive read-only and list its contents
with tarfile.open("example.tar", "r") as tar:
    # getmembers() returns one TarInfo object per archive entry
    for member in tar.getmembers():
        # Name left-aligned in 40 columns, size right-aligned in 10
        print(f"{member.name:40} {member.size:>10} bytes")
getmembers() returns a list of TarInfo objects with metadata about each entry. To look up a single entry by name, use getmember():
import tarfile
with tarfile.open("example.tar", "r") as tar:
    # Look up a specific member by name without extracting it;
    # getmember() raises KeyError if the name is not in the archive
    member = tar.getmember("README.txt")
    print(f"Name: {member.name}")
    print(f"Size: {member.size}")
    # mtime is a Unix timestamp (seconds since the epoch)
    print(f"Modified: {member.mtime}")
    print(f"Mode: {oct(member.mode)}")
Extracting Archives
Extracting everything is one line of code:
import tarfile
# Extract all to current directory.
# Only do this with archives you trust: on Python 3.12+ pass filter="data"
# to extractall() to reject members that would escape the destination.
with tarfile.open("example.tar", "r") as tar:
    tar.extractall()
Extract to a specific folder:
import tarfile
# Extract to a specific directory (created if it does not exist)
with tarfile.open("example.tar", "r") as tar:
    tar.extractall(path="/tmp/extracted")
Extract a single file:
import tarfile
with tarfile.open("example.tar", "r") as tar:
    # Extract just one file into the current directory
    tar.extract("specific_file.txt")
    # Or extract it below a specific location
    tar.extract("specific_file.txt", path="/tmp/my extraction/")
Creating Archives
Creating a new archive is straightforward:
import tarfile
import os
# Create a new, uncompressed tar archive ("w" overwrites an existing file)
with tarfile.open("backup.tar", "w") as tar:
    # Add individual files
    tar.add("config.yaml")
    tar.add("data.json")
    # Add a directory recursively, stored under the name "project"
    tar.add("my_project/", arcname="project")
The arcname parameter lets you rename the entry inside the archive.
Add files selectively:
import os
import tarfile


def filter_function(tarinfo):
    """Filter which entries to include in the archive.

    Intended for the ``filter`` argument of ``TarFile.add()``.

    Args:
        tarinfo: The ``TarInfo`` object for the entry about to be added.

    Returns:
        ``None`` to exclude the entry (hidden files and directories, i.e.
        names starting with a dot), otherwise the same ``TarInfo`` with
        normalized permissions.
    """
    # Skip hidden files
    if os.path.basename(tarinfo.name).startswith('.'):
        return None
    # Normalize permissions. Directories need the execute bit, otherwise
    # they cannot be entered once the archive has been extracted.
    tarinfo.mode = 0o755 if tarinfo.isdir() else 0o644
    return tarinfo
with tarfile.open("selective.tar", "w") as tar:
    # The filter is called once for every entry found under each directory
    tar.add("src/", filter=filter_function)
    tar.add("tests/", filter=filter_function)
Returning None from the filter excludes that file.
Working with Compressed Archives
The compression format is selected by the suffix of the mode string (when reading, plain "r" or "r:*" auto-detects it for you):
import tarfile
# Read .tar.gz (gzip compressed)
with tarfile.open("archive.tar.gz", "r:gz") as tar:
    tar.extractall()

# Read .tar.bz2 (bzip2 compressed)
with tarfile.open("archive.tar.bz2", "r:bz2") as tar:
    tar.extractall()

# Read .tar.xz (lzma compressed)
with tarfile.open("archive.tar.xz", "r:xz") as tar:
    tar.extractall()
Creating compressed archives uses the same pattern:
import tarfile
# Create a gzip-compressed archive
with tarfile.open("backup.tar.gz", "w:gz") as tar:
    tar.add("my_folder/")

# Create a bz2-compressed archive
with tarfile.open("backup.tar.bz2", "w:bz2") as tar:
    tar.add("my_folder/")
Common compression modes:
- w or r — uncompressed (when reading, plain r also auto-detects compression)
- w:gz or r:gz — gzip
- w:bz2 or r:bz2 — bzip2
- w:xz or r:xz — lzma/xz
Reading Files Without Extracting
Extract a file’s contents directly to memory:
import tarfile
with tarfile.open("archive.tar.gz", "r:gz") as tar:
    # Read a member straight into memory
    member = tar.getmember("README.md")
    # extractfile() returns None for members that are not regular files
    # or links (e.g. directories), so check before reading
    f = tar.extractfile(member)
    if f is not None:
        with f:
            contents = f.read()
        print(contents.decode("utf-8"))
This is useful for inspecting archives without touching the filesystem.
Appending to Archives
Add files to an existing archive:
import tarfile
# Open in append mode (the archive is created if it does not exist yet)
with tarfile.open("existing.tar", "a") as tar:
    tar.add("new_file.txt")
    tar.add("another_file.txt")
Be careful: append mode doesn’t work with compressed archives.
Practical Examples
Creating a Timestamped Backup
import tarfile
from datetime import datetime
import os
def create_backup(source_dir, backup_name):
    """Create a timestamped, gzip-compressed backup.

    Args:
        source_dir: Path of the file or directory to archive.
        backup_name: Prefix for the archive file name; may include a
            directory part.

    Returns:
        The path of the created archive, built as
        ``<backup_name>_<YYYYmmdd_HHMMSS>.tar.gz``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = f"{backup_name}_{timestamp}.tar.gz"
    # normpath() drops a trailing slash first; basename("my_project/") would
    # otherwise be "" and the archive would lose its top-level folder.
    top_level = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(backup_path, "w:gz") as tar:
        tar.add(source_dir, arcname=top_level)
    print(f"Backup created: {backup_path}")
    return backup_path
# Usage
create_backup("my_project", "project_backup")
# Creates: project_backup_20260314_214500.tar.gz
Creating a Distribution Package
import tarfile
import os
def create_distribution(package_name, version, directories):
    """Create a distribution tarball.

    The archive is written to ``<package_name>-<version>.tar.gz`` and every
    entry is stored beneath a ``<package_name>-<version>/`` root folder.

    Args:
        package_name: Name of the package.
        version: Version string used in the file name and root folder.
        directories: Paths of the files and directories to include.
    """
    filename = f"{package_name}-{version}.tar.gz"
    root = f"{package_name}-{version}"
    with tarfile.open(filename, "w:gz") as tar:
        for directory in directories:
            # normpath() drops a trailing slash first; basename("src/") would
            # otherwise be "" and the directory's contents would be merged
            # straight into the root folder.
            entry_name = os.path.basename(os.path.normpath(directory))
            tar.add(directory, arcname=f"{root}/{entry_name}")
    # Show archive size (measured after the archive is closed and flushed)
    size_mb = os.path.getsize(filename) / (1024 * 1024)
    print(f"Created {filename} ({size_mb:.2f} MB)")
# Usage
create_distribution("mylib", "1.0.0", ["src/", "tests/", "README.md"])
Extracting Specific File Types
import tarfile
import os
def extract_python_files(archive_path, dest_path):
    """Extract only Python files from an archive.

    Args:
        archive_path: Path of the archive; mode "r" detects compression
            automatically.
        dest_path: Directory below which matching members are extracted,
            keeping their paths from the archive.
    """
    with tarfile.open(archive_path, "r") as tar:
        for member in tar.getmembers():
            # Select members by file name suffix
            if member.name.endswith('.py'):
                tar.extract(member, path=dest_path)
                print(f"Extracted: {member.name}")
# Usage
extract_python_files("project.tar.gz", "./python_files/")
Streaming Large Archives
For very large archives, process them in a streaming fashion:
import tarfile
import io
def list_archive_contents(archive_path):
    """Print one line per file, directory and symbolic link in an archive.

    Members are read one at a time by iterating over the open archive, and
    nothing is extracted to disk.

    Args:
        archive_path: Path of the archive; mode "r:*" detects compression
            automatically.
    """
    with tarfile.open(archive_path, "r:*") as tar:
        # Iterating the TarFile yields TarInfo objects as they are read
        for member in tar:
            if member.isfile():
                print(f"File: {member.name} ({member.size} bytes)")
            elif member.isdir():
                print(f"Dir: {member.name}/")
            elif member.issym():
                print(f"Link: {member.name} -> {member.linkname}")
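The "r:*" mode auto-detects compression, and iterating over the open archive yields members one at a time as they are read. For non-seekable sources such as pipes or sockets, use the stream mode "r|*" instead.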
Error Handling
Handle common errors gracefully:
import os
import tarfile


def _is_within_directory(root, target):
    """Return True if *target* is *root* itself or located beneath it."""
    return os.path.commonpath([root, target]) == root


def _check_member(member, dest_root):
    """Raise ValueError if extracting *member* could write outside *dest_root*."""
    name = member.name
    # Compare whole path components so that harmless names such as
    # "notes..v2.txt" are not mistaken for a ".." traversal.
    unsafe = (
        name.startswith("/")
        or os.path.isabs(name)
        or ".." in name.split("/")
    )
    if not unsafe:
        target = os.path.realpath(os.path.join(dest_root, name))
        unsafe = not _is_within_directory(dest_root, target)
    if not unsafe and (member.issym() or member.islnk()):
        # Symlink targets are relative to the link's own directory;
        # hard-link targets are relative to the archive root.
        base = os.path.dirname(target) if member.issym() else dest_root
        link_target = os.path.realpath(os.path.join(base, member.linkname))
        unsafe = not _is_within_directory(dest_root, link_target)
    if unsafe:
        raise ValueError(f"Unsafe member: {member.name}")


def safe_extract(archive_path, dest_path):
    """Extract an archive after checking every member for path traversal.

    Args:
        archive_path: Path of the archive; mode "r:*" detects compression
            automatically.
        dest_path: Directory to extract into.

    Returns:
        True on success, False if the archive could not be read or
        permission was denied (the error is printed).

    Raises:
        ValueError: If a member has an absolute path, contains a ".."
            component, or is a link pointing outside ``dest_path``.
            Nothing is extracted in that case.
    """
    dest_root = os.path.realpath(dest_path)
    try:
        with tarfile.open(archive_path, "r:*") as tar:
            members = tar.getmembers()
            # Security: validate everything before writing anything
            for member in members:
                _check_member(member, dest_root)
            if hasattr(tarfile, "data_filter"):
                # Extraction filters exist on this Python version: let the
                # standard library apply its own safety checks as well.
                tar.extractall(path=dest_path, members=members, filter="data")
            else:
                tar.extractall(path=dest_path, members=members)
        return True
    except tarfile.TarError as e:
        print(f"Archive error: {e}")
        return False
    except PermissionError as e:
        print(f"Permission denied: {e}")
        return False
See Also
- tarfile-module — Official module reference
- zipfile-module — For ZIP archive handling
- python-zlib-compression — Compression fundamentals
- pathlib-guide — Working with file paths