Definition
A categorical data type is used for variables that take on a limited number of distinct values (for example, country names or product sizes). Converting such columns to categorical can significantly reduce memory usage and improve performance, especially in large datasets where each value is repeated many times.
Key Concepts
- Memory Efficiency: Categorical data stores each value as a small integer code that points into a shared list of categories, so each distinct label is stored only once
- Ordered Categories: Define order for sorting and comparison
- Category Operations: Add, remove, rename categories
- Performance: Faster groupby and value_counts operations
Example
python
# Self-contained setup: NumPy generates the synthetic columns, pandas holds them.
import numpy as np
import pandas as pd

# Create sample data with repetitive values: every text column draws from only
# four or five distinct strings, which is the situation the categorical dtype
# is designed for.
np.random.seed(42)  # fixed seed so the printed numbers are reproducible
size = 100000
data = {
    'ID': range(size),
    'Country': np.random.choice(['USA', 'UK', 'Canada', 'Australia', 'Germany'], size),
    'Product': np.random.choice(['Laptop', 'Phone', 'Tablet', 'Monitor', 'Keyboard'], size),
    'Size': np.random.choice(['Small', 'Medium', 'Large', 'XLarge'], size),
    'Rating': np.random.choice(['Poor', 'Fair', 'Good', 'Excellent'], size),
    'Sales': np.random.randint(100, 1000, size),
}
df_large = pd.DataFrame(data)

print("Original DataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df_large.info(memory_usage='deep')
print("\n")

# Check memory usage before conversion. deep=True asks pandas to inspect the
# stored Python objects instead of estimating from the dtype alone.
print("Memory usage before categorical conversion:")
memory_before = df_large.memory_usage(deep=True)  # computed once, reused below
print(memory_before)
print(f"Total: {memory_before.sum() / 1024**2:.2f} MB")
print("\n")
# Convert to categorical
# Work on a copy so df_large remains available as the unconverted baseline.
df_categorical = df_large.copy()
for cat_col in ('Country', 'Product', 'Size', 'Rating'):
    df_categorical[cat_col] = df_categorical[cat_col].astype('category')

print("Memory usage after categorical conversion:")
usage_after = df_categorical.memory_usage(deep=True)
print(usage_after)
print(f"Total: {usage_after.sum() / 1024**2:.2f} MB")
print("\n")

# Calculate memory savings as the percentage of the original footprint
# that the conversion removed.
original_memory = df_large.memory_usage(deep=True).sum()
categorical_memory = usage_after.sum()
savings = 100 * (1 - categorical_memory / original_memory)
print(f"Memory savings: {savings:.2f}%")
print("\n")
# Working with smaller dataset for demonstration: the first 20 rows keep the
# printed output short.
df_small = df_large.head(20).copy()

# Create ordered categorical: the list position defines the rank,
# i.e. Small < Medium < Large < XLarge.
size_order = ['Small', 'Medium', 'Large', 'XLarge']
ordered_sizes = pd.Categorical(df_small['Size'], categories=size_order, ordered=True)
df_small['Size_Ordered'] = ordered_sizes
size_col = df_small['Size_Ordered']

print("Ordered categorical:")
print(df_small.loc[:, ['Size', 'Size_Ordered']].head(10))
print(f"Is ordered: {size_col.cat.ordered}")
print("\n")

# Sorting with ordered categorical: rows follow size_order rather than the
# alphabetical order of the labels.
df_sorted = df_small.sort_values(by='Size_Ordered')
print("Sorted by Size (ordered):")
print(df_sorted.loc[:, ['ID', 'Size_Ordered']].head(10))
print("\n")

# Access category properties: the category labels and the integer codes
# that stand in for them.
print("Categories:", size_col.cat.categories)
first_codes = size_col.cat.codes.iloc[:10].tolist()
print("Codes (internal representation):", first_codes)
print("\n")
# Add new category: this extends the set of allowed labels; no row takes the
# new value yet.
with_extra = df_small['Size_Ordered'].cat.add_categories(['XXLarge'])
df_small['Size_Ordered'] = with_extra
print("Categories after adding XXLarge:", with_extra.cat.categories)
print("\n")

# Remove category: drop the label again, returning to the original four.
without_extra = with_extra.cat.remove_categories(['XXLarge'])
df_small['Size_Ordered'] = without_extra
print("Categories after removing XXLarge:", without_extra.cat.categories)
print("\n")
# Rename categories
# Old label -> new label. The insertion order of this mapping is also the
# intended ranking (Poor < Fair < Good < Excellent) and is reused below.
rating_labels = {
    'Poor': '1-Poor',
    'Fair': '2-Fair',
    'Good': '3-Good',
    'Excellent': '4-Excellent',
}
# Declare all four ratings explicitly instead of letting pandas infer them from
# the 20-row sample. If a rating were missing from the sample, the inferred
# category set would be incomplete and reorder_categories() below would raise
# ValueError, because the new list must match the existing categories exactly.
# sorted() reproduces the alphabetical order that inference would have produced.
rating_cat = pd.Categorical(df_small['Rating'], categories=sorted(rating_labels))
df_small['Rating_Cat'] = rating_cat
df_small['Rating_Cat'] = df_small['Rating_Cat'].cat.rename_categories(rating_labels)
print("Renamed categories:")
print(df_small[['Rating', 'Rating_Cat']].head(10))
print("\n")

# Reorder categories: switch from alphabetical to the ranked order and mark
# the categorical as ordered so comparisons and sorting follow that rank.
df_small['Rating_Cat'] = df_small['Rating_Cat'].cat.reorder_categories(
    list(rating_labels.values()), ordered=True
)
print("Reordered categories:")
print(df_small['Rating_Cat'].cat.categories)
print("\n")
# Value counts on categorical (faster)
print("Value counts on categorical data:")
print(df_categorical['Country'].value_counts())
print("\n")

# GroupBy on categorical (faster)
print("GroupBy on categorical data:")
# observed=True is passed explicitly: grouping by a categorical key without it
# emits a FutureWarning on recent pandas 2.x releases, and the default changes
# between versions. Every country occurs in the data, so the result is the same
# either way; the argument only pins the behaviour.
country_means = df_categorical.groupby('Country', observed=True)['Sales'].mean()
print(country_means.head())
print("\n")
# Convert multiple columns at once: DataFrame.astype applies the dtype to every
# selected column, and the converted columns are assigned back in one statement.
categorical_columns = ['Country', 'Product', 'Size', 'Rating']
df_multi_cat = df_large.head(1000).copy()
converted = df_multi_cat[categorical_columns].astype('category')
df_multi_cat[categorical_columns] = converted
print("Multiple columns converted:")
print(df_multi_cat.dtypes)
