df.size    # 9 → total elements
df.ndim    # 2 → always for DataFrame
df.values  # underlying NumPy array
df.empty   # True if no elements
2
Exploring Data
EDA first step
quick look
df.head() # first 5 rows
df.head(10) # first 10 rows
df.tail() # last 5 rows
df.tail(3) # last 3 rows
df.sample(5) # 5 random rows
df.sample(frac=0.1) # random 10%
structure & types
df.info() # dtypes + null counts + memory
df.describe() # stats for numeric columns
df.describe(include='all') # all cols
df.describe(include='object') # strings
df.shape    # (rows, cols)
df.dtypes   # type per column
df.columns  # column names
counts & frequencies
df.count() # non-null per col
df.nunique() # unique count per col
df['col'].value_counts() # frequency table
df['col'].value_counts(normalize=True)
df.isnull().sum() # NaN count per col
df.memory_usage(deep=True) # bytes per col
interactive property inspector
click a method/property
Click any method above to inspect it
3
Selecting Columns
column access
single column → Series
df['name'] # returns Series
df.name  # dot notation (avoid spaces)
Dot notation breaks when column name has spaces or conflicts with a DataFrame method (like df.count). Use [] for safety.
df.drop(columns=['score'])
df[df.columns.difference(['score'])]
# Select by dtype
df.select_dtypes(include='number')
df.select_dtypes(include='object')
df.select_dtypes(exclude='float')
# Select by column name pattern
df.filter(like='sales') # contains 'sales'
df.filter(regex='^q') # starts with q
reorder columns
df = df[['score','name','age']] # manual reorder
df = df[sorted(df.columns)] # alphabetical
df = df[['name'] + [c for c in df.columns if c != 'name']] # name first
4
Selecting Rows — loc, iloc, at
row access
.loc — label based (INCLUSIVE)
df.loc[0] # row label 0
df.loc[0:2] # rows 0,1,2 (end inclusive!)
df.loc[0, 'name'] # single value
df.loc[0:2, 'name':'age'] # rows + cols
df.loc[[0,2], 'name'] # rows 0 and 2
df.at[0, 'name'] # fastest single cell
.loc slice end is INCLUSIVE. So df.loc[0:2] returns 3 rows (0, 1, 2).
.iloc — position based (EXCLUSIVE end)
df.iloc[0] # first row
df.iloc[0:2] # rows 0,1 (end exclusive)
df.iloc[0, 1] # row 0, col 1
df.iloc[-1] # last row
df.iloc[:, 0] # all rows, first col
df.iloc[0:3, 0:2] # slice rows + cols
df.iat[0, 1] # fastest scalar
.iloc slice end is EXCLUSIVE (like Python lists). df.iloc[0:2] → 2 rows.
loc vs iloc — key difference with non-default index
DATAFRAME (index → A)
 5 → 10
10 → 20
15 → 30
df.loc[5] → label 5
A = 10
Uses the INDEX LABEL 5
df.iloc[0] → position 0
A = 10
Uses POSITION 0 (1st row)
Avoid chained indexing: df['col'][0]. Use df.loc[0,'col'] or df.iloc[0,0] — chaining causes SettingWithCopyWarning and may not modify the original.
# AND — both must be true
df[(df['age'] > 25) & (df['score'] >= 90)]
# OR — either must be true
df[(df['age'] > 25) | (df['score'] >= 90)]
# NOT — negate
df[~(df['age'] > 25)]
Always wrap each condition in parentheses when using & or |. Without them Python's operator precedence causes errors.
special filters — isin, between, str
# isin — value in list
df[df['name'].isin(['Alice','Charlie'])]
df[~df['name'].isin(['Bob'])] # NOT in
# between — inclusive range
df[df['age'].between(20, 30)]
# String filters
df[df['name'].str.startswith('A')]
df[df['name'].str.contains('lic')]
df[df['name'].str.contains('al',case=False)]
6
Adding & Removing Columns / Rows
modify
adding columns
# Direct assignment
df['grade'] = 'A' # constant
df['age_plus_10'] = df['age'] + 10
# insert() — at specific position
df.insert(1, 'grade', 'A')
# assign() — method chaining
df = df.assign(
grade='A',
age_plus_10=df['age'] + 10
)
# Conditional column via apply
df['result'] = df['score'].apply(
lambda x: 'Pass' if x >= 60 else 'Fail'
)
removing columns
# drop() — preferred
df.drop(columns=['grade'])
df.drop(columns=['grade','age_plus_10'])
df.drop('grade', axis=1) # axis=1
df.drop(columns=['grade'], inplace=True)
# del — modifies in place
del df['grade']
# pop() — removes and returns
removed = df.pop('grade')
df.drop(0) # drop row label 0
df.drop([0,1,2]) # drop multiple rows
df.drop_duplicates() # remove duplicate rows
df.drop_duplicates(subset=['name']) # based on column
df.drop_duplicates(keep='last') # keep last occurrence
7
Sorting Data
order
sort by values
df.sort_values('age') # asc
df.sort_values('age', ascending=False)
# Multi-column sort
df.sort_values(['age','score'])
df.sort_values(['age','score'],
ascending=[True,False])
# Sort and reset index
df.sort_values('age').reset_index(drop=True)
sort by index & top N
df.sort_index()
df.sort_index(ascending=False)
# Top / bottom N rows (faster than sort + head)
df.nlargest(3, 'score') # top 3 scores
df.nsmallest(3, 'age') # youngest 3
df.nlargest(3, ['score','age']) # tie-break
nlargest / nsmallest are faster than sort_values().head(n) for large DataFrames.
8
Renaming & Reindexing
labels
rename columns
# dict mapping
df.rename(columns={'name':'full_name','age':'years'})
# function on all columns
df.rename(columns=str.upper)
df.rename(columns=str.lower)
df.rename(columns=lambda x: x.strip())
# Reassign all at once
df.columns = ['full_name','years','score']
# Add prefix/suffix
df.add_prefix('col_') # col_name, col_age
df.add_suffix('_2024') # name_2024, age_2024
set & reset index
# Set a column as the row index
df.set_index('name')
df.set_index('name', inplace=True)
# Reset back to 0, 1, 2...
df.reset_index() # old index → column
df.reset_index(drop=True) # discard old index
# Rename index
df.index.name = 'row_id'
df.rename_axis('row_id')
df.rename_axis(None) # remove name
9
Handling Missing Data
data quality
detect
df.isnull() # bool DataFrame
df.notnull()
df.isnull().sum() # NaN count per col
df.isnull().sum().sum() # total NaN count
df.isnull().any() # any NaN per col?
df.isnull().all() # all NaN per col?
df.isnull().mean() # fraction NaN per col
drop
df.dropna() # any NaN → drop row
df.dropna(how='all') # all NaN → drop row
df.dropna(subset=['age']) # NaN in 'age' → drop
df.dropna(axis=1) # drop NaN columns
df.dropna(thresh=2) # keep if ≥2 non-NaN
fill
df.fillna(0) # fill all NaN with 0
df['age'].fillna(0)
df.fillna({'age':0,'score':50}) # per column
# Fill with statistics
df['age'].fillna(df['age'].mean())
df['age'].fillna(df['age'].median())
df['name'].fillna(df['name'].mode()[0])
# Forward / backward fill
df.ffill()  # forward fill
df.bfill()  # backward fill
# Interpolate
df['score'].interpolate() # linear
category dtype saves up to 90% memory for columns with few unique string values (gender, country, status).
dtype quick reference
dtype
python type
memory
use for
int64
int
8 bytes
whole numbers (default)
int8/16/32
int
1/2/4 bytes
small integers — save memory
float64
float
8 bytes
decimals (default)
float32
float
4 bytes
ML model inputs
object
str/mixed
variable
strings (inefficient)
category
Categorical
small
repeated strings (gender, country)
bool
bool
1 byte
True/False flags
datetime64
datetime
8 bytes
date/time values
Int64 (cap I)
int+NaN
8 bytes
nullable integer
11
String Operations (.str)
text data
case, clean & search
s = df['name']
s.str.upper()       # UPPERCASE
s.str.lower()       # lowercase
s.str.title()       # Title Case
s.str.capitalize()  # First letter only
s.str.strip() # remove whitespace
s.str.len() # length per element
s.str.contains('ali', case=False)
s.str.startswith('A')
s.str.endswith('e')
s.str.find('li') # position of first match
split, replace & extract
s.str.split(',') # → list
s.str.split(',', expand=True) # → DataFrame
s.str.split(',').str[0] # first part
s.str.replace('alice','Alice')
s.str.replace(r'\d+','', regex=True)
s.str.extract(r'(\d+)') # capture group
s.str.extractall(r'(\d+)')
s.str.isdigit()  # all digits?
s.str.isalpha()  # all letters?
s.str.zfill(5) # zero-pad to width 5
Real-world uses: cleaning scraped text · email/domain extraction · parsing structured strings in a column · standardizing casing · feature extraction with regex · phone number formatting
Alternative: df['col'].apply(lambda x: x.upper()) — works but 5–10× slower than .str. Always prefer .str for vectorized string operations.
g = df.groupby('department')
g['salary'].mean()
g['salary'].sum()
g['salary'].count()
g['salary'].max()
g['salary'].min()
g['salary'].std()
g['salary'].median()
g['salary'].nunique()
# All numeric columns
g.mean()
g.describe()
multiple aggregations — agg()
# List of agg functions
g['salary'].agg(['mean','min','max'])
# Dict — different agg per column
g.agg({
'salary': ['mean','max'],
'age': 'median',
'name': 'count'
})
# Named aggregations (clean column names)
g.agg(
avg_salary=('salary','mean'),
max_salary=('salary','max'),
headcount=('name','count')
)
transform — keeps original index
# Returns same shape as original df
# Perfect for adding group stats to rows
df['dept_avg'] = df.groupby('dept')['salary'].\
transform('mean')
df['rank_in_dept'] = df.groupby('dept')['salary'].\
transform('rank')
transform() is one of the most powerful GroupBy tools — it lets you add group-level aggregations as a new column while keeping all original rows intact.
filter & apply
# filter — keep/remove entire groups
df.groupby('dept').filter(
lambda x: x['salary'].mean() > 50000
)
# apply — custom function on each group
def top_2(group):
return group.nlargest(2,'salary')
df.groupby('dept').apply(top_2)
# Multiple group keys
df.groupby(['dept','gender'])['salary'].mean()
# 1. Vectorized >> apply >> iterrows
df['double'] = df['score'] * 2 # fast
# 2. query() faster on large DFs
df.query('age > 25 and score > 80')
# 3. eval() for arithmetic
df.eval('result = score * 2 + age',
inplace=True)
# 4. Chunked reading for huge files
chunks = pd.read_csv('big.csv',
chunksize=10000)
result = pd.concat([process(c) for c in chunks])
# 5. If you must iterate, use itertuples
for row in df.itertuples():
print(row.name, row.score)