pandas DataFrames Explained

· 7 min read · Updated March 14, 2026 · intermediate
python pandas dataframes data

pandas is the go-to library for data analysis in Python, and its primary data structure is the DataFrame — a two-dimensional labeled array with columns of potentially different types. Think of it like a spreadsheet or SQL table that you can manipulate programmatically.

This guide walks you through everything you need to work with pandas DataFrames effectively.

Installing and Importing pandas

Install pandas first if needed:

pip install pandas

Import it in your Python code:

import pandas as pd

The convention is to alias pandas as pd — you’ll see this throughout all pandas documentation and tutorials.

Creating DataFrames

There are several ways to create a DataFrame:

From a Dictionary

The most common approach:

import pandas as pd

data = {
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 28],
    "city": ["New York", "San Francisco", "Chicago", "Boston"],
    "salary": [55000, 72000, 68000, 61000]
}

df = pd.DataFrame(data)
print(df)
#       name  age           city  salary
# 0    Alice   25      New York   55000
# 1      Bob   30  San Francisco   72000
# 2  Charlie   35       Chicago   68000
# 3    Diana   28        Boston   61000

From a List of Dictionaries

import pandas as pd

records = [
    {"name": "Alice", "age": 25, "city": "New York"},
    {"name": "Bob", "age": 30, "city": "San Francisco"},
    {"name": "Charlie", "age": 35, "city": "Chicago"}
]

df = pd.DataFrame(records)
print(df)

From a CSV File

import pandas as pd

# Read from a CSV file
df = pd.read_csv("data.csv")

# Or from a URL
df = pd.read_csv("https://example.com/data.csv")

From a NumPy Array

import pandas as pd
import numpy as np

arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df = pd.DataFrame(arr, columns=["a", "b", "c"])
print(df)
#    a  b  c
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

DataFrame Properties

Every DataFrame has useful attributes:

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [55000, 72000, 68000]
})

# Shape (rows, columns)
print(df.shape)  # (3, 3)

# Column names
print(df.columns)  # Index(['name', 'age', 'salary'], dtype='object')

# Data types
print(df.dtypes)
# name      object
# age       int64
# salary    int64
# dtype: object

# First few rows
print(df.head())  # First 5 rows
print(df.head(2)) # First 2 rows

# Last few rows
print(df.tail())  # Last 5 rows

# Summary statistics
print(df.describe())
#          age    salary
# count   3.0   30000.0
# mean   30.0  65000.0
# std     5.0  8700...
# min    25.0  55000.0
# max    35.0  72000.0

Selecting Columns

Single Column

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [55000, 72000, 68000]
})

# Returns a Series
names = df["name"]
print(names)
# 0      Alice
# 1        Bob
# 2    Charlie
# Name: name, dtype: object

Multiple Columns

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [55000, 72000, 68000]
})

# Returns a DataFrame
subset = df[["name", "salary"]]
print(subset)
#       name  salary
# 0    Alice   55000
# 1      Bob   72000
# 2  Charlie   68000

Using Attributes

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35]
})

# You can access columns as attributes (if no spaces/conflicts)
print(df.name)  # Same as df["name"]

Selecting Rows

By Index (iloc)

Integer-based positional indexing:

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 28]
}, index=["a", "b", "c", "d"])

# Single row
print(df.iloc[0])  # First row as Series

# Multiple rows
print(df.iloc[0:3])  # Rows 0, 1, 2

# Specific rows
print(df.iloc[[0, 2]])  # Rows 0 and 2

By Label (loc)

Label-based indexing:

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 28]
}, index=["a", "b", "c", "d"])

# Single row by label
print(df.loc["a"])

# Range of labels
print(df.loc["a":"c"])  # Rows a, b, c

# With column selection
print(df.loc["a":"b", "name":"age"])

Boolean Indexing

Filter rows using conditions:

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 28],
    "salary": [55000, 72000, 68000, 61000]
})

# Single condition
print(df[df["age"] > 28])
#       name  age  salary
# 1      Bob   30   72000
# 2  Charlie   35   68000

# Multiple conditions (use & for AND, | for OR)
print(df[(df["age"] > 25) & (df["salary"] > 60000)])
#       name  age  salary
# 1      Bob   30   72000
# 2  Charlie   35   68000
# 3    Diana   28   61000

# Using query() method
print(df.query("age > 28 and salary > 60000"))

Modifying DataFrames

Adding Columns

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35]
})

# Add a new column
df["salary"] = [55000, 72000, 68000]
print(df)

# Add computed column
df["salary_bonus"] = df["salary"] * 0.1

Updating Columns

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35]
})

# Update all values in a column
df["age"] = df["age"] + 1

# Conditional update
df.loc[df["age"] > 30, "age"] = 31

Deleting Columns and Rows

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "temp": [1, 2, 3]
})

# Drop columns
df = df.drop(columns=["temp"])
# Or: df.drop("temp", axis=1, inplace=True)

# Drop rows by index
df = df.drop(index=[0, 2])

Renaming

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob"],
    "age": [25, 30]
})

# Rename columns
df = df.rename(columns={"name": "full_name", "age": "years"})

# Rename rows
df = df.rename(index={0: "first", 1: "second"})

Handling Missing Data

pandas uses NaN (Not a Number) to represent missing values:

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, None, 35],
    "salary": [55000, 72000, None]
})

# Check for missing values
print(df.isnull())  # Boolean DataFrame
print(df.isnull().sum())  # Count per column

# Drop rows with missing values
df_clean = df.dropna()

# Fill missing values
df_filled = df.fillna(0)
df_filled = df["salary"].fillna(df["salary"].mean())

# Forward fill (use previous value)
df_filled = df.fillna(method="ffill")

# Backward fill (use next value)
df_filled = df.fillna(method="bfill")

Grouping and Aggregation

The groupby() method splits data into groups:

import pandas as pd

df = pd.DataFrame({
    "department": ["Sales", "Sales", "Engineering", "Engineering", "HR"],
    "employee": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "salary": [55000, 72000, 85000, 90000, 50000]
})

# Group by department
grouped = df.groupby("department")

# Aggregate
print(grouped["salary"].sum())
print(grouped["salary"].mean())
print(grouped["salary"].min())
print(grouped["salary"].max())

# Multiple aggregations
print(grouped.agg({"salary": ["sum", "mean", "count"]}))

Pivot Tables

import pandas as pd

df = pd.DataFrame({
    "date": ["2024-01", "2024-01", "2024-02", "2024-02"],
    "product": ["A", "B", "A", "B"],
    "sales": [100, 150, 200, 120]
})

# Simple pivot
pivot = df.pivot(index="date", columns="product", values="sales")
print(pivot)

# With aggregation
pivot = df.pivot_table(index="date", columns="product", values="sales", aggfunc="sum")

Merging and Joining

import pandas as pd

# Two sample DataFrames
df1 = pd.DataFrame({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]})
df2 = pd.DataFrame({"id": [1, 2, 4], "salary": [55000, 72000, 68000]})

# Inner join (only matching IDs)
merged = pd.merge(df1, df2, on="id", how="inner")

# Left join (all from left)
merged = pd.merge(df1, df2, on="id", how="left")

# Outer join (all from both)
merged = pd.merge(df1, df2, on="id", how="outer")

# Concatenate (stack rows)
df3 = pd.DataFrame({"id": [5], "name": ["Eve"]})
combined = pd.concat([df1, df3])

Sorting

import pandas as pd

df = pd.DataFrame({
    "name": ["Charlie", "Alice", "Bob"],
    "age": [35, 25, 30],
    "salary": [68000, 55000, 72000]
})

# Sort by one column
df_sorted = df.sort_values("age")

# Sort by multiple columns
df_sorted = df.sort_values(["age", "salary"], ascending=[True, False])

# Sort by index
df_sorted = df.sort_index()

# Sort descending
df_sorted = df.sort_values("age", ascending=False)

Getting Started

DataFrames are the workhorse of data analysis in Python. The key concepts to remember:

  • Create DataFrames from dictionaries, CSV files, or databases
  • Select data using loc (labels), iloc (integer positions), or boolean conditions
  • Use groupby() for split-apply-combine operations
  • Handle missing data with dropna(), fillna(), or isnull()
  • Combine DataFrames with merge(), join(), or concat()

From here, explore the pandas groupby functionality for more advanced data analysis, and learn how to read from and write to various file formats.

See Also