Definition
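Missing data — represented in pandas as NaN (numeric), None (object), or NaT (datetime) — requires careful handling. pandas provides methods to detect, remove, fill, and interpolate missing values; which strategy is appropriate depends on the data and on the analysis being performed.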
Key Concepts
- Detection: isna(), notna(), isnull(), notnull() (isnull()/notnull() are aliases of isna()/notna())
- Removal: dropna() with various parameters
- Filling: fillna(), ffill(), bfill()
- Interpolation: Linear, polynomial, time-based methods
- Indicators: Add columns to track which values were missing
Example
python
# Create dataset with missing values
# Fifteen consecutive daily rows: Date is complete, while Temperature,
# Humidity, Rainfall and City each contain NaN gaps to experiment with.
data_missing = dict(
    Date=pd.date_range(start='2024-01-01', periods=15, freq='D'),
    Temperature=[32, 35, np.nan, 38, 40, np.nan, np.nan, 45, 48, 50, np.nan, 52, 55, np.nan, 58],
    Humidity=[65, np.nan, 70, 72, np.nan, 75, 78, 80, np.nan, np.nan, 85, 87, 90, 92, np.nan],
    Rainfall=[0, 0, np.nan, 5, 10, np.nan, 0, 0, 15, np.nan, 20, 0, 0, np.nan, 5],
    City=['NYC', 'NYC', 'NYC', np.nan, 'NYC', 'LA', 'LA', np.nan, 'LA', 'LA', 'Chicago', 'Chicago', np.nan, 'Chicago', 'Chicago'],
)
df_missing = pd.DataFrame(data=data_missing)
# Header and frame are emitted on separate lines, then a blank spacer.
print("DataFrame with missing values:", df_missing, sep="\n")
print("\n")
# ========== DETECTION ==========
# Absolute number of gaps in each column
print("Missing values per column:", df_missing.isna().sum(), sep="\n")
print("\n")
# Same counts expressed as a share of all rows, rounded to two decimals
print("Percentage of missing values per column:")
print(df_missing.isna().sum().div(len(df_missing)).mul(100).round(2))
print("\n")
# Flag every row that has at least one gap.
# NOTE: the flag is stored on df_missing itself, so every later
# df_missing.copy() carries the Has_Missing column along.
df_missing['Has_Missing'] = df_missing.isna().any(axis='columns')
print("Rows with any missing value:")
print(df_missing.loc[df_missing['Has_Missing']])
print("\n")
# ========== REMOVAL ==========
# Keep only fully populated rows: a single NaN anywhere discards the row
df_dropped_any = df_missing.dropna(axis='index', how='any')
print(
    f"Rows after dropping any NaN: {len(df_dropped_any)} (from {len(df_missing)})",
    df_dropped_any,
    sep="\n",
)
print("\n")
# Drop rows where all values are missing
# Small standalone frame: only the second row (index 1) is NaN in every column.
df_all_missing = pd.DataFrame(
    dict(
        A=[1, np.nan, 3, np.nan],
        B=[np.nan, np.nan, 6, 7],
        C=[np.nan, np.nan, 9, 10],
    )
)
print("Before dropping all-NaN rows:", df_all_missing, sep="\n")
# how='all' removes a row only when every one of its cells is missing;
# partially filled rows survive.
df_dropped_all = df_all_missing.dropna(axis='index', how='all')
print("\nAfter dropping all-NaN rows:", df_dropped_all, sep="\n")
print("\n")
# Drop rows with missing values in specific columns
# Only gaps in Temperature or City disqualify a row; NaN elsewhere is tolerated.
df_dropped_subset = df_missing.dropna(
    axis='index',
    how='any',
    subset=['Temperature', 'City'],
)
print(
    f"Rows after dropping NaN in Temperature and City: {len(df_dropped_subset)}",
    df_dropped_subset,
    sep="\n",
)
print("\n")
# Drop columns with missing values
# Switching the axis removes whole columns that contain at least one NaN.
df_dropped_cols = df_missing.dropna(axis='columns', how='any')
print("Columns after dropping those with NaN:")
print(list(df_dropped_cols.columns))
print("\n")
# Drop columns with more than X% missing
# dropna's `thresh` is the minimum number of NON-missing values a column
# needs in order to be kept -- not the number of missing values allowed.
# Translate the tolerated missing share into that minimum: with 15 rows and
# a 30% tolerance at most 4 values may be missing, so 11 must be present.
threshold = 0.3 # 30%
max_missing = int(threshold * len(df_missing))
min_non_missing = len(df_missing) - max_missing
df_threshold = df_missing.dropna(thresh=min_non_missing, axis=1)
print(f"Columns with less than {threshold*100}% missing:")
print(df_threshold.columns.tolist())
print("\n")
# ========== FILLING ==========
# Fill with a constant value
# assign() returns a new frame, so df_missing itself keeps its gaps.
df_filled_constant = df_missing.assign(
    Temperature=df_missing['Temperature'].fillna(0),
)
print("Fill Temperature with 0:")
print(df_filled_constant[['Date', 'Temperature']])
print("\n")
# Fill with mean/median/mode
# Each statistic is computed from the column's observed values (NaN is
# skipped) and then used to plug that same column's gaps.
df_filled_stats = df_missing.assign(
    Temperature=lambda frame: frame['Temperature'].fillna(frame['Temperature'].mean()),
    Humidity=lambda frame: frame['Humidity'].fillna(frame['Humidity'].median()),
)
print("Fill with mean (Temperature) and median (Humidity):")
print(df_filled_stats[['Temperature', 'Humidity']])
print("\n")
# Forward fill (propagate last valid observation forward)
df_ffill = df_missing.assign(Temperature=df_missing['Temperature'].ffill())
print("Forward fill Temperature:", df_ffill[['Date', 'Temperature']], sep="\n")
print("\n")
# Backward fill (pull the next valid observation back into the gap)
df_bfill = df_missing.assign(Temperature=df_missing['Temperature'].bfill())
print("Backward fill Temperature:", df_bfill[['Date', 'Temperature']], sep="\n")
print("\n")
# Fill with limit (only fill certain number of consecutive NaNs)
# With limit=1 only the first NaN of each run is filled; the rest of a
# longer run stays missing.
df_limit = df_missing.assign(Temperature=df_missing['Temperature'].ffill(limit=1))
print("Forward fill with limit=1:", df_limit[['Date', 'Temperature']], sep="\n")
print("\n")
# Fill different columns with different values
# One fillna call with a column -> replacement mapping: columns that are
# not listed (e.g. Date) are left untouched.
df_filled_dict = df_missing.copy().fillna(
    value=dict(
        Temperature=df_missing['Temperature'].mean(),
        Humidity=df_missing['Humidity'].median(),
        Rainfall=0,
        City='Unknown',
    )
)
print("Fill with dictionary (different values per column):", df_filled_dict, sep="\n")
print("\n")
# ========== INTERPOLATION ==========
# Linear interpolation: gaps are filled on a straight line between the
# neighbouring valid readings
df_interp = df_missing.assign(
    Temperature=df_missing['Temperature'].interpolate(method='linear'),
)
print("Linear interpolation:", df_interp[['Date', 'Temperature']], sep="\n")
print("\n")
# Polynomial interpolation: fits a second-order curve instead of a line
df_interp_poly = df_missing.assign(
    Temperature=df_missing['Temperature'].interpolate(method='polynomial', order=2),
)
print("Polynomial interpolation (order=2):")
print(df_interp_poly[['Date', 'Temperature']].round(2))
print("\n")
# Time-based interpolation
# method='time' weights by the index timestamps, so Date must become the
# index before interpolating.
df_time = df_missing.set_index('Date').assign(
    Temperature=lambda frame: frame['Temperature'].interpolate(method='time'),
)
print("Time-based interpolation:")
print(df_time['Temperature'].round(2))
print("\n")
# ========== MISSING DATA INDICATORS ==========
# Create indicator for missing values
# The flag must be captured BEFORE filling, otherwise the information about
# which readings were imputed is lost. assign() evaluates its keyword
# arguments in order, so the indicator still sees the original gaps.
df_indicator = df_missing.assign(
    Temperature_Was_Missing=lambda frame: frame['Temperature'].isna(),
    Temperature=lambda frame: frame['Temperature'].fillna(frame['Temperature'].mean()),
)
print("With missing data indicator:")
print(df_indicator[['Date', 'Temperature', 'Temperature_Was_Missing']])
print("\n")
# ========== GROUP-BASED FILLING ==========
# Fill missing values with group mean
df_group = df_missing.copy()
# transform('mean') broadcasts each city's mean Temperature back onto the
# rows of that city. Rows whose City is itself missing belong to no group
# (groupby drops NaN keys by default) and receive NaN here.
city_mean_temperature = df_group.groupby('City')['Temperature'].transform('mean')
# Use the group means only to plug gaps. Assigning the transform result
# directly would overwrite valid readings on rows with a missing City with
# NaN; fillna leaves every existing reading untouched. A gap on a row
# without a City stays NaN because there is no group mean to use.
df_group['Temperature'] = df_group['Temperature'].fillna(city_mean_temperature)
print("Fill with group (City) mean:")
print(df_group[['Date', 'City', 'Temperature']])
print("\n")
# ========== REPLACING SPECIFIC VALUES ==========
# Replace specific values with NaN
# -999 acts as a sentinel for "no reading" in this sample frame.
df_replace = pd.DataFrame(
    dict(
        A=[1, 2, -999, 4, -999],
        B=[5, -999, 7, 8, 9],
    )
)
print("Before replacing -999 with NaN:", df_replace, sep="\n")
# Turning the sentinel into a real NaN lets isna()/fillna()/dropna() see it.
df_replace = df_replace.replace(to_replace=-999, value=np.nan)
print("\nAfter replacing:", df_replace, sep="\n")
print("\n")
# ========== ADVANCED: COMBINING STRATEGIES BY COLUMN TYPE ==========
# Simple demonstration of applying a different imputation strategy to each
# kind of column. (This is not statistical "multiple imputation", which
# generates several imputed datasets and pools the results.)
df_multi_strategy = df_missing.copy()
# Strategy 1: Numerical columns - interpolate (linear by default)
numeric_cols = df_multi_strategy.select_dtypes(include=[np.number]).columns
df_multi_strategy[numeric_cols] = df_multi_strategy[numeric_cols].interpolate()
# Strategy 2: Categorical columns - forward fill
# Select both 'object' and 'string': text columns are object dtype on older
# pandas but may use a dedicated string dtype on newer versions, where
# include=['object'] alone would silently skip them.
categorical_cols = df_multi_strategy.select_dtypes(include=['object', 'string']).columns
df_multi_strategy[categorical_cols] = df_multi_strategy[categorical_cols].ffill()
print("Multi-strategy imputation:")
print(df_multi_strategy)
