import numpy as np
# Array creation
arr = np.array([1, 2, 3])
arr = np.arange(10) # 0-9
arr = np.linspace(0, 1, 5) # 5 points 0 to 1
arr = np.zeros((3, 4)) # 3x4 zeros
arr = np.ones((2, 3)) # 2x3 ones
arr = np.random.randn(3, 4) # Normal distribution
arr = np.random.randint(0, 10, (3, 4)) # Random integers
arr = np.eye(3) # Identity matrix
# Shape & Dimensions
arr.shape # (3, 4)
arr.ndim # 2
arr.size # 12
arr.dtype # data type
# Indexing
arr[0] # First row
arr[0, 1] # Row 0, column 1
arr[1:3] # Rows 1-2
arr[:, 2] # All rows, column 2
arr[[0, 2]] # Rows 0 and 2
arr[arr > 5] # Boolean indexing
# Reshaping
arr.reshape((2, 6))
arr.T # Transpose
arr.flatten() # 1D copy
arr.ravel() # Flatten (view when possible, otherwise copy)
# Operations
arr + 1 # Broadcast scalar
arr * 2
arr + arr # Element-wise
np.dot(arr, arr.T) # Matrix multiply (inner dimensions must match; equivalently arr @ arr.T)
np.sqrt(arr)
np.exp(arr)
np.sum(arr) # Sum all
np.mean(arr)
np.std(arr)
np.max(arr)
np.min(arr)
# Axis operations
np.sum(arr, axis=0) # Sum down columns (one result per column)
np.sum(arr, axis=1) # Sum across rows (one result per row)
np.mean(arr, axis=1)
# Title: Pandas & NumPy Cheat Sheet
# (End of section: NumPy Basics)
Pandas DataFrames
import pandas as pd
# Create DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df = pd.read_csv('file.csv')
df = pd.read_excel('file.xlsx')
df = pd.read_sql('SELECT * FROM table', conn)
# Info
df.shape # (3, 2)
df.dtypes
df.info() # Summary
df.head() # First 5 rows
df.tail() # Last 5 rows
df.describe() # Statistics
# Selection
df['A'] # Column (Series)
df[['A', 'B']] # Multiple columns
df.loc[0] # Row by label
df.iloc[0] # Row by position
df.loc[0, 'A'] # Cell
df.loc[0:2, 'A':'B'] # Label slice (both endpoints inclusive)
# Filtering
df[df['A'] > 2]
df[(df['A'] > 1) & (df['B'] < 6)]
df[df['name'].isin(['Raj', 'Priya'])]
df[df['email'].str.contains('@')]
# Missing data
df.isnull()
df.dropna() # Remove NaN rows
df.dropna(axis=1) # Remove NaN columns
df.fillna(0) # Fill with 0
df.fillna(df.mean()) # Fill with column means (numeric columns)
# Modification (these return a new DataFrame unless assigned or inplace=True)
df['C'] = df['A'] + df['B'] # New column
df['A'] = df['A'] * 2 # Modify column
df.rename(columns={'A': 'X'}) # Rename
df.drop('A', axis=1) # Drop column
df.drop([0, 1]) # Drop rows
df.astype({'A': 'int32'}) # Change type
Groupby & Aggregation
# Group By
df.groupby('country').size() # Count per group
df.groupby('country')['salary'].mean() # Mean salary by country
df.groupby('country')[['salary', 'bonus']].sum() # Multiple columns
# Agg (flexible aggregation)
df.groupby('country').agg({'salary': 'mean', 'bonus': 'sum'})
df.groupby('country').agg(['mean', 'std', 'min', 'max'])
df.groupby('country').agg(sales_sum=('sales', 'sum'), sales_avg=('sales', 'mean')) # Named aggregation
# Transform (result has the same shape as the input)
df['z_score'] = df.groupby('country')['salary'].transform(lambda x: (x - x.mean()) / x.std())
df['pct_change'] = df.groupby('country')['sales'].transform('pct_change')
# Apply (custom function per group)
df.groupby('country').apply(lambda x: x['salary'].max() - x['salary'].min())
# Value counts
df['country'].value_counts()
df['country'].value_counts(normalize=True) # Proportions
# Crosstab (cross-tabulation)
pd.crosstab(df['gender'], df['country'])
Merge & Join
# Merge (SQL-like JOIN)
pd.merge(df1, df2, on='key') # Inner join (default)
pd.merge(df1, df2, on='key', how='left') # Left join
pd.merge(df1, df2, on='key', how='right') # Right join
pd.merge(df1, df2, on='key', how='outer') # Full outer join
# Merge on different column names
pd.merge(df1, df2, left_on='id_left', right_on='id_right')
# Merge on index
pd.merge(df1, df2, left_index=True, right_index=True)
# Concat (combines vertically by default)
pd.concat([df1, df2]) # Stack rows
pd.concat([df1, df2], axis=1) # Stack columns
pd.concat([df1, df2], ignore_index=True) # Reset index
# Join (merge on index)
df1.join(df2, lsuffix='_left', rsuffix='_right')
# Append — deprecated in pandas 1.4 and removed in pandas 2.0
df1.append(df2)
# Use pd.concat([df1, df2]) instead
Reshape & Pivot
# Pivot (long -> wide; raises if index/columns pairs are not unique)
df.pivot(index='date', columns='country', values='sales')
# Pivot table (handles duplicates via aggregation)
pd.pivot_table(df, values='sales', index='date', columns='country', aggfunc='sum')
pd.pivot_table(df, values=['sales', 'profit'], index='date', columns='country', aggfunc=['sum', 'mean'])
# Melt (wide -> long format)
pd.melt(df, id_vars=['id'], value_vars=['A', 'B', 'C'])
# Stack & Unstack
df.stack() # Columns -> inner index level (longer)
df.unstack() # Inner index level -> columns (wider)
# Transpose
df.T
# Explode (expand list-like cells into rows)
df['tags'].explode()
# Example: Reshape data
# Wide: country | 2022 | 2023 | 2024
# Long: country | year | sales
df_long = df.melt(id_vars=['country'], var_name='year', value_name='sales')
Time Series & Performance
# DateTime
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df.resample('D').sum() # Daily sum
df.resample('M').mean() # Monthly mean ('M' deprecated since pandas 2.2 — use 'ME')
df.rolling(7).mean() # 7-day moving average
# Shift (lag/lead)
df['prev'] = df['sales'].shift(1) # Previous value
df['diff'] = df['sales'] - df['sales'].shift(1) # Change
# Time-based indexing (partial string indexing on a DatetimeIndex)
df.loc['2024-01-01':'2024-12-31']
df.loc['2024']
df['2024':'2025'] # Bare [] slicing works for row ranges; .loc is clearer
# Extracting components
df['year'] = df.index.year
df['month'] = df.index.month
df['weekday'] = df.index.weekday
# Performance tips
Use categorical for repeated values: df['country'] = df['country'].astype('category')
Use chunking for large files:
for chunk in pd.read_csv('big_file.csv', chunksize=10000):
process(chunk)
Vectorization: df['A'] + df['B'] faster than df.apply(lambda row: row['A'] + row['B'])
Use eval: df.eval('A + B') for complex expressions
Query: df.query('A > 5 and B < 10') cleaner than df[(df['A'] > 5) & (df['B'] < 10)]