Advanced Missing Data Handling

Definition

Missing data (NaN, None, NaT) requires careful handling. Pandas provides various methods to detect, remove, fill, and interpolate missing values based on different strategies.

Key Concepts

  • Detection: isna() and notna(), plus their aliases isnull() and notnull()
  • Removal: dropna() with the how, subset, thresh, and axis parameters
  • Filling: fillna(), ffill(), bfill()
  • Interpolation: Linear, polynomial, time-based methods
  • Indicators: Add columns to track which values were missing

Example

python

# Create dataset with missing values: 15 daily weather readings in which every
# measured column (and the City label) has a few gaps to work with.
data_missing = {
    'Date': pd.date_range('2024-01-01', periods=15, freq='D'),
    'Temperature': [32, 35, np.nan, 38, 40, np.nan, np.nan, 45, 48, 50, np.nan, 52, 55, np.nan, 58],
    'Humidity': [65, np.nan, 70, 72, np.nan, 75, 78, 80, np.nan, np.nan, 85, 87, 90, 92, np.nan],
    'Rainfall': [0, 0, np.nan, 5, 10, np.nan, 0, 0, 15, np.nan, 20, 0, 0, np.nan, 5],
    'City': ['NYC', 'NYC', 'NYC', np.nan, 'NYC', 'LA', 'LA', np.nan, 'LA', 'LA', 'Chicago', 'Chicago', np.nan, 'Chicago', 'Chicago']
}

df_missing = pd.DataFrame(data_missing)
print("DataFrame with missing values:")
print(df_missing)
print("\n")

# ========== DETECTION ==========
# isna() marks each missing cell as True; summing the marks counts them per column
missing_per_column = df_missing.isna().sum()
print("Missing values per column:")
print(missing_per_column)
print("\n")

print("Percentage of missing values per column:")
print((missing_per_column / len(df_missing) * 100).round(2))
print("\n")

# Check if any value is missing in each row.
# NOTE: the flag is stored on df_missing itself, so every later
# df_missing.copy() carries this column too, and it is a snapshot that is not
# refreshed when a copy has its gaps filled.
df_missing['Has_Missing'] = df_missing.isna().any(axis=1)
print("Rows with any missing value:")
print(df_missing[df_missing['Has_Missing']])
print("\n")

# ========== REMOVAL ==========
# Drop rows with any missing values (dropna's default behaviour)
df_dropped_any = df_missing.dropna()
print(f"Rows after dropping any NaN: {len(df_dropped_any)} (from {len(df_missing)})")
print(df_dropped_any)
print("\n")

# Drop rows where all values are missing: how='all' keeps a row as long as at
# least one of its cells holds a value, so only the fully empty row 1 is removed.
df_all_missing = pd.DataFrame({
    'A': [1, np.nan, 3, np.nan],
    'B': [np.nan, np.nan, 6, 7],
    'C': [np.nan, np.nan, 9, 10]
})
print("Before dropping all-NaN rows:")
print(df_all_missing)
df_dropped_all = df_all_missing.dropna(how='all')
print("\nAfter dropping all-NaN rows:")
print(df_dropped_all)
print("\n")

# Drop rows with missing values in specific columns; gaps in other columns
# do not cause a row to be removed.
df_dropped_subset = df_missing.dropna(subset=['Temperature', 'City'])
print(f"Rows after dropping NaN in Temperature and City: {len(df_dropped_subset)}")
print(df_dropped_subset)
print("\n")

# Drop columns with missing values (axis=1 applies the rule to columns)
df_dropped_cols = df_missing.dropna(axis=1)
print("Columns after dropping those with NaN:")
print(df_dropped_cols.columns.tolist())
print("\n")

# Drop columns with more than X% missing.
# dropna's `thresh` is the minimum number of NON-missing values a column must
# have to be kept, so the allowed share of missing values is first turned into
# a count of tolerated gaps and then into the required count of present values.
threshold = 0.3  # 30%
max_missing_allowed = int(threshold * len(df_missing))
min_present_required = len(df_missing) - max_missing_allowed
df_threshold = df_missing.dropna(thresh=min_present_required, axis=1)
print(f"Columns with less than {threshold*100}% missing:")
print(df_threshold.columns.tolist())
print("\n")

# ========== FILLING ==========
# Constant fill: every gap in Temperature becomes 0
df_filled_constant = df_missing.assign(
    Temperature=df_missing['Temperature'].fillna(0)
)
print("Fill Temperature with 0:")
print(df_filled_constant[['Date', 'Temperature']])
print("\n")

# Statistical fill: the column mean for Temperature, the column median for
# Humidity (both statistics are computed from the observed values only)
temperature_mean = df_missing['Temperature'].mean()
humidity_median = df_missing['Humidity'].median()
df_filled_stats = df_missing.assign(
    Temperature=df_missing['Temperature'].fillna(temperature_mean),
    Humidity=df_missing['Humidity'].fillna(humidity_median),
)
print("Fill with mean (Temperature) and median (Humidity):")
print(df_filled_stats[['Temperature', 'Humidity']])
print("\n")

# Forward fill: carry the last valid observation forward into the gap
df_ffill = df_missing.assign(Temperature=df_missing['Temperature'].ffill())
print("Forward fill Temperature:")
print(df_ffill[['Date', 'Temperature']])
print("\n")

# Backward fill: pull the next valid observation back into the gap
df_bfill = df_missing.assign(Temperature=df_missing['Temperature'].bfill())
print("Backward fill Temperature:")
print(df_bfill[['Date', 'Temperature']])
print("\n")

# Limited forward fill: at most one consecutive NaN is filled per gap, so the
# second value of a two-day gap stays missing
df_limit = df_missing.assign(
    Temperature=df_missing['Temperature'].ffill(limit=1)
)
print("Forward fill with limit=1:")
print(df_limit[['Date', 'Temperature']])
print("\n")

# Per-column fill values supplied as a {column: value} mapping
fill_values = {
    'Temperature': temperature_mean,
    'Humidity': humidity_median,
    'Rainfall': 0,
    'City': 'Unknown',
}
df_filled_dict = df_missing.fillna(fill_values)
print("Fill with dictionary (different values per column):")
print(df_filled_dict)
print("\n")

# ========== INTERPOLATION ==========
# Linear: each gap is filled along the straight line between its neighbours
temperature_linear = df_missing['Temperature'].interpolate(method='linear')
df_interp = df_missing.assign(Temperature=temperature_linear)
print("Linear interpolation:")
print(df_interp[['Date', 'Temperature']])
print("\n")

# Polynomial: same idea, but fitting a second-order curve
polynomial_options = {'method': 'polynomial', 'order': 2}
df_interp_poly = df_missing.assign(
    Temperature=df_missing['Temperature'].interpolate(**polynomial_options)
)
print("Polynomial interpolation (order=2):")
print(df_interp_poly[['Date', 'Temperature']].round(2))
print("\n")

# Time-based: method='time' works from the index, so Date becomes the index first
indexed_by_date = df_missing.set_index('Date')
df_time = indexed_by_date.assign(
    Temperature=indexed_by_date['Temperature'].interpolate(method='time')
)
print("Time-based interpolation:")
print(df_time['Temperature'].round(2))
print("\n")

# ========== MISSING DATA INDICATORS ==========
# Record which Temperature readings were absent, then fill them with the
# column mean. The mask is taken from the unfilled column, so it still tells
# observed readings apart from imputed ones afterwards.
temperature_was_missing = df_missing['Temperature'].isna()
temperature_filled = df_missing['Temperature'].fillna(
    df_missing['Temperature'].mean()
)
df_indicator = df_missing.assign(
    Temperature_Was_Missing=temperature_was_missing,
    Temperature=temperature_filled,
)
print("With missing data indicator:")
print(df_indicator[['Date', 'Temperature', 'Temperature_Was_Missing']])
print("\n")

# ========== GROUP-BASED FILLING ==========
# Fill missing Temperature values with the mean of the row's City group.
#
# groupby() leaves rows whose key (City) is NaN out of every group, so the
# transformed result holds NaN for those rows. Assigning that result straight
# back to the column would erase valid temperatures on rows with an unknown
# city. Using the group means only as fill values keeps every observed
# reading and touches nothing but the gaps.
df_group = df_missing.copy()
city_mean_temperature = df_group.groupby('City')['Temperature'].transform('mean')
df_group['Temperature'] = df_group['Temperature'].fillna(city_mean_temperature)
# A row lacking both Temperature and City stays NaN: it has no group to borrow from.
print("Fill with group (City) mean:")
print(df_group[['Date', 'City', 'Temperature']])
print("\n")

# ========== REPLACING SPECIFIC VALUES ==========
# Turn the placeholder value -999 into NaN so that the missing-data tools
# (isna, dropna, fillna, ...) recognise those cells as missing
sentinel_columns = {
    'A': [1, 2, -999, 4, -999],
    'B': [5, -999, 7, 8, 9],
}
df_replace = pd.DataFrame(sentinel_columns)
print("Before replacing -999 with NaN:")
print(df_replace)
df_replace = df_replace.replace(to_replace=-999, value=np.nan)
print("\nAfter replacing:")
print(df_replace)
print("\n")

# ========== ADVANCED: MULTIPLE IMPUTATION CONCEPT ==========
# Simple demonstration: pick the imputation strategy by column type
df_multi_strategy = df_missing.copy()

# Sort the columns into the two groups handled below
# NOTE(review): the categorical group is selected by the 'object' dtype;
# confirm City is stored as object in the pandas version in use.
numeric_cols = df_multi_strategy.select_dtypes(include='number').columns
categorical_cols = df_multi_strategy.select_dtypes(include='object').columns

# Strategy 1: numerical columns are interpolated (default method)
numeric_filled = df_multi_strategy[numeric_cols].interpolate()
df_multi_strategy[numeric_cols] = numeric_filled

# Strategy 2: categorical columns repeat the last known label
categorical_filled = df_multi_strategy[categorical_cols].ffill()
df_multi_strategy[categorical_cols] = categorical_filled

print("Multi-strategy imputation:")
print(df_multi_strategy)