🧠 AI Computer Institute
Content is AI-generated for educational purposes. Verify critical information independently. A bharath.ai initiative.

Pandas & NumPy Cheat Sheet

Grades 9-12 · 6 sections

NumPy Basics

import numpy as np

# Array creation
arr = np.array([1, 2, 3])
arr = np.arange(10)          # 0-9
arr = np.linspace(0, 1, 5)   # 5 points 0 to 1
arr = np.zeros((3, 4))       # 3x4 zeros
arr = np.ones((2, 3))        # 2x3 ones
arr = np.random.randn(3, 4)  # Normal distribution
arr = np.random.randint(0, 10, (3, 4))  # Random integers
arr = np.eye(3)              # Identity matrix

# Shape & Dimensions
arr.shape                    # (3, 4)
arr.ndim                     # 2
arr.size                     # 12
arr.dtype                    # data type

# Indexing
arr[0]                       # First row
arr[0, 1]                    # Row 0, column 1
arr[1:3]                     # Rows 1-2
arr[:, 2]                    # All rows, column 2
arr[[0, 2]]                  # Rows 0 and 2
arr[arr > 5]                 # Boolean indexing

# Reshaping
arr.reshape((2, 6))
arr.T                        # Transpose
arr.flatten()                # 1D copy
arr.ravel()                  # Flatten (view when possible)

# Operations
arr + 1                      # Broadcast scalar
arr * 2
arr + arr                    # Element-wise
np.dot(arr, arr.T)           # Matrix multiply (inner dimensions must match)
np.sqrt(arr)
np.exp(arr)
np.sum(arr)                  # Sum all
np.mean(arr)
np.std(arr)
np.max(arr)
np.min(arr)

# Axis operations
np.sum(arr, axis=0)          # Sum columns
np.sum(arr, axis=1)          # Sum rows
np.mean(arr, axis=1)

Pandas DataFrames

import pandas as pd

# Create DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df = pd.read_csv('file.csv')
df = pd.read_excel('file.xlsx')
df = pd.read_sql('SELECT * FROM table', conn)

# Info
df.shape                     # (3, 2)
df.dtypes
df.info()                    # Summary
df.head()                    # First 5 rows
df.tail()                    # Last 5 rows
df.describe()                # Statistics

# Selection
df['A']                      # Column (Series)
df[['A', 'B']]               # Multiple columns
df.loc[0]                    # Row by label
df.iloc[0]                   # Row by position
df.loc[0, 'A']               # Cell
df.loc[0:2, 'A':'B']         # Slice (labels, end-inclusive)

# Filtering
df[df['A'] > 2]
df[(df['A'] > 1) & (df['B'] < 6)]
df[df['name'].isin(['Raj', 'Priya'])]
df[df['email'].str.contains('@')]

# Missing data
df.isnull()
df.dropna()                  # Remove NaN rows
df.dropna(axis=1)            # Remove NaN columns
df.fillna(0)                 # Fill with 0
df.fillna(df.mean())         # Fill with mean (numeric columns)

# Modification
df['C'] = df['A'] + df['B']  # New column
df['A'] = df['A'] * 2        # Modify column
df.rename(columns={'A': 'X'}) # Rename (returns a new DataFrame)
df.drop('A', axis=1)         # Drop column
df.drop([0, 1])              # Drop rows
df.astype({'A': 'int32'})    # Change type

Groupby & Aggregation

# Group By
df.groupby('country').size()  # Count per group
df.groupby('country')['salary'].mean()  # Mean salary by country
df.groupby('country')[['salary', 'bonus']].sum()  # Multiple columns

# Agg (flexible aggregation)
df.groupby('country').agg({'salary': 'mean', 'bonus': 'sum'})
df.groupby('country').agg(['mean', 'std', 'min', 'max'])
df.groupby('country').agg(sales_sum=('sales', 'sum'), sales_avg=('sales', 'mean'))

# Transform (return same shape)
df['z_score'] = df.groupby('country')['salary'].transform(lambda x: (x - x.mean()) / x.std())
df['pct_change'] = df.groupby('country')['sales'].pct_change()  # Per-group % change

# Apply (custom function)
df.groupby('country').apply(lambda x: x['salary'].max() - x['salary'].min())

# Value counts
df['country'].value_counts()
df['country'].value_counts(normalize=True)  # Proportions

# Crosstab (cross-tabulation)
pd.crosstab(df['gender'], df['country'])

Merge & Join

# Merge (SQL-like JOIN)
pd.merge(df1, df2, on='key')  # Inner join
pd.merge(df1, df2, on='key', how='left')    # Left join
pd.merge(df1, df2, on='key', how='right')   # Right join
pd.merge(df1, df2, on='key', how='outer')   # Full join

# Merge on different column names
pd.merge(df1, df2, left_on='id_left', right_on='id_right')

# Merge on index
pd.merge(df1, df2, left_index=True, right_index=True)

# Concat (combine vertically)
pd.concat([df1, df2])        # Stack rows
pd.concat([df1, df2], axis=1)  # Stack columns
pd.concat([df1, df2], ignore_index=True)  # Reset index

# Join (merge on index)
df1.join(df2, lsuffix='_left', rsuffix='_right')

# Append (removed in pandas 2.0)
df1.append(df2)              # Only works in pandas < 2.0
# Use: pd.concat([df1, df2]) instead

Reshape & Pivot

# Pivot (wide format)
df.pivot(index='date', columns='country', values='sales')

# Pivot table (with aggregation)
pd.pivot_table(df, values='sales', index='date', columns='country', aggfunc='sum')
pd.pivot_table(df, values=['sales', 'profit'], index='date', columns='country', aggfunc=['sum', 'mean'])

# Melt (long format)
pd.melt(df, id_vars=['id'], value_vars=['A', 'B', 'C'])

# Stack & Unstack
df.stack()     # Long format
df.unstack()   # Wide format

# Transpose
df.T

# Explode (expand lists in cells)
df['tags'].explode()

# Example: Reshape data
# Wide: country | 2022 | 2023 | 2024
# Long: country | year | sales
df_long = df.melt(id_vars=['country'], var_name='year', value_name='sales')

Time Series & Performance

# DateTime
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

df.resample('D').sum()       # Daily sum
df.resample('ME').mean()     # Monthly mean ('M' in pandas < 2.2)
df.rolling(7).mean()         # 7-day moving average

# Shift (lag/lead)
df['prev'] = df['sales'].shift(1)  # Previous value
df['diff'] = df['sales'] - df['sales'].shift(1)  # Change

# Time-based indexing
df.loc['2024-01-01':'2024-12-31']
df.loc['2024']
df['2024':'2025']

# Extracting components
df['year'] = df.index.year
df['month'] = df.index.month
df['weekday'] = df.index.weekday

# Performance tips
Use categorical for repeated values: df['country'] = df['country'].astype('category')
Use chunking for large files:
for chunk in pd.read_csv('big_file.csv', chunksize=10000):
    process(chunk)

Vectorization: df['A'] + df['B'] is faster than df.apply(lambda row: row['A'] + row['B'], axis=1)
Use eval: df.eval('A + B') for complex expressions
Query: df.query('A > 5 and B < 10') is cleaner than df[(df['A'] > 5) & (df['B'] < 10)]

More Cheat Sheets