Pandas 进阶语法详解
目录
前言
本文是 Pandas 进阶教程,重点讲解以下高级功能:
- 缺失值处理:识别、填充、删除缺失值
- 数据分组:GroupBy 的强大功能
- 数据合并:多种合并方式和技巧
- 交叉表:快速生成交叉统计表
- 透视表:灵活的数据透视分析
环境准备
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Display options: show every column, at most 20 rows, auto-detect terminal
# width, and truncate cell text at 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)
print(f"Pandas 版本: {pd.__version__}")
print(f"NumPy 版本: {np.__version__}")
创建测试数据集
为了方便学习和测试,我们先创建一些测试数据:
# Build a synthetic employee table (100 rows); bonus and performance_score
# deliberately contain None values so the missing-data examples below have
# something to work on.
np.random.seed(42)
n_employees = 100
employees_data = {
    'employee_id': range(1, n_employees + 1),
    'name': [f'员工{i}' for i in range(1, n_employees + 1)],
    'department': np.random.choice(['销售部', '技术部', '市场部', '人事部', '财务部'], n_employees),
    'position': np.random.choice(['经理', '主管', '专员', '助理'], n_employees),
    'age': np.random.randint(22, 55, n_employees),
    'salary': np.random.randint(5000, 30000, n_employees),
    'bonus': np.random.choice([None, 1000, 2000, 3000, 5000], n_employees, p=[0.2, 0.3, 0.3, 0.15, 0.05]),
    'join_date': pd.date_range('2020-01-01', periods=n_employees, freq='D'),
    'performance_score': np.random.choice([None, 60, 70, 80, 90, 95], n_employees, p=[0.1, 0.1, 0.2, 0.3, 0.2, 0.1])
}
df_employees = pd.DataFrame(employees_data)
# Build a synthetic sales table (200 rows) keyed by employee_id.
sales_data = {
    'sale_id': range(1, 201),
    'employee_id': np.random.randint(1, n_employees + 1, 200),
    'product': np.random.choice(['产品A', '产品B', '产品C', '产品D'], 200),
    'quantity': np.random.randint(1, 100, 200),
    'unit_price': np.random.choice([100, 200, 300, 500], 200),
    'sale_date': pd.date_range('2024-01-01', periods=200, freq='D'),
    'region': np.random.choice(['华北', '华东', '华南', '西南'], 200),
    'discount': np.random.choice([None, 0.05, 0.1, 0.15, 0.2], 200, p=[0.3, 0.2, 0.3, 0.15, 0.05])
}
df_sales = pd.DataFrame(sales_data)
# Total amount = quantity * unit price, applying the discount when present
# (a missing discount counts as 0).
df_sales['total_amount'] = df_sales['quantity'] * df_sales['unit_price'] * (1 - df_sales['discount'].fillna(0))
print("=" * 60)
print("员工数据预览")
print("=" * 60)
print(df_employees.head())
print(f"\n数据形状: {df_employees.shape}")
print(f"\n缺失值统计:")
print(df_employees.isnull().sum())
print("\n" + "=" * 60)
print("销售数据预览")
print("=" * 60)
print(df_sales.head())
print(f"\n数据形状: {df_sales.shape}")
缺失值处理
缺失值是数据分析中常见的问题,Pandas 提供了丰富的工具来处理缺失值。
1. 检测缺失值
# Several ways to detect missing values.
print("=" * 60)
print("1. 缺失值检测")
print("=" * 60)
# Method 1: isnull() / isna() return a boolean DataFrame of the same shape.
print("\n缺失值位置(前10行):")
print(df_employees.isnull().head(10))
# Method 2: count of missing values per column.
print("\n各列缺失值数量:")
print(df_employees.isnull().sum())
# Method 3: share of missing values per column, in percent.
print("\n各列缺失值比例:")
print(df_employees.isnull().sum() / len(df_employees) * 100)
# Method 4: quick checks -- "any missing at all?" and the grand total.
print(f"\n是否有缺失值: {df_employees.isnull().any().any()}")
print(f"缺失值总数: {df_employees.isnull().sum().sum()}")
# Method 5: notna() / notnull() count the non-missing entries instead.
print("\n非缺失值统计:")
print(df_employees.notna().sum())
2. 删除缺失值
print("=" * 60)
print("2. 删除缺失值")
print("=" * 60)
# dropna() removes rows (or columns) containing missing values.
print("\n原始数据形状:", df_employees.shape)
# Drop rows with a missing value in ANY column.
df_drop_any = df_employees.dropna()
print(f"删除任何缺失值后的形状: {df_drop_any.shape}")
# Drop rows only when ALL columns are missing (none here, so no change).
df_drop_all = df_employees.dropna(how='all')
print(f"删除全部缺失值后的形状: {df_drop_all.shape}")
# Drop rows that are missing in specific columns only.
df_drop_bonus = df_employees.dropna(subset=['bonus'])
print(f"删除 bonus 列缺失值后的形状: {df_drop_bonus.shape}")
# thresh keeps rows that have at least that many NON-missing values.
df_drop_thresh = df_employees.dropna(thresh=7)  # at least 7 non-missing values
print(f"至少7个非缺失值的行数: {df_drop_thresh.shape}")
# axis=1 drops whole COLUMNS that contain any missing value.
df_drop_cols = df_employees.dropna(axis=1)
print(f"删除包含缺失值的列后的形状: {df_drop_cols.shape}")
3. 填充缺失值
print("=" * 60)
print("3. 填充缺失值")
print("=" * 60)
# Method 1: fill with a constant value.
df_fill_0 = df_employees.copy()
df_fill_0['bonus'] = df_fill_0['bonus'].fillna(0)
print("\n使用 0 填充 bonus 列:")
print(df_fill_0['bonus'].value_counts().head())
# Method 2: forward fill.  fillna(method='ffill') is deprecated since
# pandas 2.1 and removed in 3.0 -- Series.ffill() is the supported form.
df_fill_ffill = df_employees.copy()
df_fill_ffill['bonus'] = df_fill_ffill['bonus'].ffill()
print("\n前向填充 bonus 列(前5个缺失值):")
print(df_fill_ffill[df_employees['bonus'].isnull()]['bonus'].head())
# Method 3: backward fill (same deprecation: use Series.bfill()).
df_fill_bfill = df_employees.copy()
df_fill_bfill['bonus'] = df_fill_bfill['bonus'].bfill()
print("\n后向填充 bonus 列(前5个缺失值):")
print(df_fill_bfill[df_employees['bonus'].isnull()]['bonus'].head())
# Method 4: fill with a summary statistic (mean / median / mode).
df_fill_mean = df_employees.copy()
df_fill_mean['bonus'] = df_fill_mean['bonus'].fillna(df_fill_mean['bonus'].mean())
print(f"\n使用均值填充 bonus: {df_fill_mean['bonus'].mean():.2f}")
df_fill_median = df_employees.copy()
df_fill_median['bonus'] = df_fill_median['bonus'].fillna(df_fill_median['bonus'].median())
print(f"使用中位数填充 bonus: {df_fill_median['bonus'].median():.2f}")
df_fill_mode = df_employees.copy()
df_fill_mode['bonus'] = df_fill_mode['bonus'].fillna(df_fill_mode['bonus'].mode()[0])
print(f"使用众数填充 bonus: {df_fill_mode['bonus'].mode()[0]}")
# Method 5: linear interpolation between neighbouring valid values.
df_fill_interpolate = df_employees.copy()
df_fill_interpolate['performance_score'] = df_fill_interpolate['performance_score'].interpolate(method='linear')
print("\n使用线性插值填充 performance_score:")
print(df_fill_interpolate['performance_score'].describe())
# Method 6: group-wise fill, using each department's own mean.
df_fill_group = df_employees.copy()
df_fill_group['bonus'] = df_fill_group.groupby('department')['bonus'].transform(
    lambda x: x.fillna(x.mean())
)
print("\n按部门分组填充 bonus:")
print(df_fill_group.groupby('department')['bonus'].mean())
4. 高级缺失值处理
print("=" * 60)
print("4. 高级缺失值处理")
print("=" * 60)
# Method 1: a dict mapping column name -> fill value, one fillna() call.
fill_dict = {
    'bonus': 0,
    'performance_score': df_employees['performance_score'].median()
}
df_fill_dict = df_employees.copy().fillna(fill_dict)
print("\n使用字典填充:")
print(df_fill_dict.isnull().sum())
# Method 2: `limit` caps how many missing values get filled (top-down).
df_fill_limit = df_employees.copy()
df_fill_limit['bonus'] = df_fill_limit['bonus'].fillna(0, limit=2)
print("\n限制填充数量(最多填充2个):")
print(f"bonus 列缺失值: {df_fill_limit['bonus'].isnull().sum()}")
# Method 3: conditional fill -- the default bonus depends on the department.
df_fill_condition = df_employees.copy()
dept_bonus = {
    '销售部': 2000,
    '技术部': 3000,
    '市场部': 1500,
    '人事部': 1000,
    '财务部': 1200
}
df_fill_condition['bonus'] = df_fill_condition.apply(
    lambda row: dept_bonus[row['department']] if pd.isna(row['bonus']) else row['bonus'],
    axis=1
)
print("\n按部门条件填充:")
print(df_fill_condition.groupby('department')['bonus'].mean())
5. 缺失值处理实战案例
# Section 5: an end-to-end cleaning example built from the pieces above.
print("=" * 60)
print("5. 缺失值处理实战案例")
print("=" * 60)
def clean_employee_data(df):
    """Clean missing values in an employee DataFrame.

    Steps:
      1. Drop records missing the key field ``name``.
      2. Fill ``bonus`` with each department's median (0 when a department
         has no bonus data at all).
      3. Fill ``performance_score`` by linear interpolation, then fall back
         to the overall median for anything still missing.

    Returns a cleaned copy; the input frame is not modified.
    """
    cleaned = df.copy()
    # A row without a name is unusable -- drop it outright.
    cleaned = cleaned.dropna(subset=['name'])

    def _dept_bonus_fill(series):
        # Department median, or 0 when the whole group is missing.
        fallback = 0 if series.isna().all() else series.median()
        return series.fillna(fallback)

    cleaned['bonus'] = cleaned.groupby('department')['bonus'].transform(_dept_bonus_fill)
    # Interpolate scores, then patch remaining gaps (e.g. leading NaNs)
    # with the median of the interpolated column.
    scores = cleaned['performance_score'].interpolate(method='linear')
    cleaned['performance_score'] = scores.fillna(scores.median())
    return cleaned
# Run the cleaner and compare record counts before/after.
df_cleaned = clean_employee_data(df_employees)
print("\n清理后的缺失值统计:")
print(df_cleaned.isnull().sum())
print(f"\n清理前记录数: {len(df_employees)}")
print(f"清理后记录数: {len(df_cleaned)}")
数据分组(GroupBy)
GroupBy 是 Pandas 最强大的功能之一,可以对数据进行分组并应用各种聚合操作。
1. 基本分组操作
print("=" * 60)
print("1. 基本分组操作")
print("=" * 60)
# Method 1: group by a single column.
grouped_dept = df_employees.groupby('department')
print("\n按部门分组:")
print(f"分组数量: {len(grouped_dept)}")
print(f"分组名称: {grouped_dept.groups.keys()}")
# Row count of each group.
print("\n各分组大小:")
print(grouped_dept.size())
# Method 2: group by several columns at once.
grouped_multi = df_employees.groupby(['department', 'position'])
print("\n按部门和职位分组:")
print(f"分组数量: {len(grouped_multi)}")
print("\n各分组大小(前10个):")
print(grouped_multi.size().head(10))
# Method 3: a GroupBy object is iterable as (key, sub-DataFrame) pairs.
print("\n遍历分组(前3个部门):")
for name, group in list(grouped_dept)[:3]:
    print(f"\n{name} 部门:")
    print(group[['name', 'position', 'salary']].head(3))
2. 聚合函数
print("=" * 60)
print("2. 聚合函数")
print("=" * 60)
# Method 1: built-in aggregations on a grouped column.
print("\n按部门统计平均薪资:")
print(grouped_dept['salary'].mean())
print("\n按部门统计多个指标:")
agg_result = grouped_dept.agg({
    'salary': ['mean', 'median', 'std', 'min', 'max'],
    'age': ['mean', 'min', 'max'],
    'bonus': 'sum'
})
print(agg_result)
# Method 2: a custom aggregation function.
def salary_range(x):
    """Return the spread (max minus min) of a salary Series."""
    highest, lowest = x.max(), x.min()
    return highest - lowest
print("\n自定义聚合函数(薪资范围):")
print(grouped_dept['salary'].apply(salary_range))
# Method 3: several aggregation functions at once via agg([...]).
print("\n多个聚合函数:")
print(grouped_dept['salary'].agg(['mean', 'std', 'count']))
# Method 4: named aggregation (pandas 0.25+) -- each keyword becomes an
# explicitly named output column.
agg_named = grouped_dept.agg(
    avg_salary=('salary', 'mean'),
    max_salary=('salary', 'max'),
    total_bonus=('bonus', 'sum'),
    employee_count=('employee_id', 'count')
)
print("\n命名聚合结果:")
print(agg_named)
3. 分组转换(Transform)
print("=" * 60)
print("3. 分组转换(Transform)")
print("=" * 60)
# transform() returns a result aligned row-for-row with the original data.
df_transform = df_employees.copy()
# Each employee's salary compared with the department average.
df_transform['dept_avg_salary'] = df_transform.groupby('department')['salary'].transform('mean')
df_transform['salary_diff'] = df_transform['salary'] - df_transform['dept_avg_salary']
print("\n薪资与部门平均值的差值(前10行):")
print(df_transform[['name', 'department', 'salary', 'dept_avg_salary', 'salary_diff']].head(10))
# Standardise within each department (Z-score).
df_transform['salary_zscore'] = df_transform.groupby('department')['salary'].transform(
    lambda x: (x - x.mean()) / x.std()
)
print("\n薪资标准化(Z-score):")
print(df_transform[['name', 'department', 'salary', 'salary_zscore']].head(10))
4. 分组过滤(Filter)
print("=" * 60)
print("4. 分组过滤(Filter)")
print("=" * 60)
# filter(): keep only departments with more than 15 employees.
df_filtered = df_employees.groupby('department').filter(lambda x: len(x) > 15)
print("\n过滤后的部门:")
print(df_filtered['department'].value_counts())
# Keep only departments whose mean salary exceeds 15000.
df_filtered_salary = df_employees.groupby('department').filter(
    lambda x: x['salary'].mean() > 15000
)
print("\n平均薪资大于15000的部门:")
print(df_filtered_salary['department'].unique())
5. 分组应用(Apply)
print("=" * 60)
print("5. 分组应用(Apply)")
print("=" * 60)
# Method 1: apply a function to each group.
def top_earners(group, n=3):
    """Return the n highest-paid rows of a group, as name/salary columns."""
    ranked = group.nlargest(n, 'salary')
    return ranked[['name', 'salary']]
print("\n各部门薪资最高的3名员工:")
top_salaries = df_employees.groupby('department').apply(top_earners, n=3)
print(top_salaries)
# Method 2: the applied function can also return a Series of scalars.
def dept_stats(group):
    """Summarise one department: head-count, mean salary/age, total bonus."""
    summary = {
        'count': len(group),
        'avg_salary': group['salary'].mean(),
        'avg_age': group['age'].mean(),
        'total_bonus': group['bonus'].sum()
    }
    return pd.Series(summary)
# Apply the per-department summary across every group.
print("\n各部门统计信息:")
dept_statistics = df_employees.groupby('department').apply(dept_stats)
print(dept_statistics)
6. 分组迭代和高级操作
print("=" * 60)
print("6. 分组迭代和高级操作")
print("=" * 60)
# Method 1: get_group() extracts a single group's rows.
sales_dept = grouped_dept.get_group('销售部')
print("\n销售部员工信息(前5行):")
print(sales_dept.head())
# Method 2: as_index=False keeps the group keys as ordinary columns.
grouped_reset = df_employees.groupby('department', as_index=False).agg({
    'salary': 'mean',
    'age': 'mean'
})
print("\n分组聚合后重置索引:")
print(grouped_reset.head())
# Method 3: top-3 salaries within each of the first three departments.
print("\n各部门内按薪资排序(前3个部门):")
for name, group in list(grouped_dept)[:3]:
    print(f"\n{name} 部门(按薪资降序):")
    print(group.nlargest(3, 'salary')[['name', 'salary']])
数据合并(Merge & Join)
Pandas 提供了多种数据合并方式,类似于 SQL 的 JOIN 操作。
1. Merge 基本操作
print("=" * 60)
print("1. Merge 基本操作")
print("=" * 60)
# A small department lookup table to join against.
df_dept_info = pd.DataFrame({
    'department': ['销售部', '技术部', '市场部', '人事部', '财务部'],
    'dept_manager': ['张经理', '李经理', '王经理', '赵经理', '刘经理'],
    'budget': [500000, 800000, 300000, 200000, 400000],
    'location': ['A座3楼', 'B座5楼', 'A座2楼', 'A座1楼', 'B座2楼']
})
print("\n部门信息表:")
print(df_dept_info)
# Inner join (the default): only keys present on both sides survive.
df_inner = pd.merge(df_employees, df_dept_info, on='department', how='inner')
print("\n内连接结果(前5行):")
print(df_inner[['name', 'department', 'dept_manager', 'budget']].head())
# Left join: every employee row is kept; department info attached when found.
df_left = pd.merge(df_employees, df_dept_info, on='department', how='left')
print(f"\n左连接: 原始{len(df_employees)}行 -> 合并后{len(df_left)}行")
# Right join: driven by the department table instead.
df_right = pd.merge(df_employees, df_dept_info, on='department', how='right')
print(f"右连接: 原始{len(df_dept_info)}行 -> 合并后{len(df_right)}行")
# Outer join: the union of keys from both sides.
df_outer = pd.merge(df_employees, df_dept_info, on='department', how='outer')
print(f"外连接: 合并后{len(df_outer)}行")
2. 不同列名的合并
print("=" * 60)
print("2. 不同列名的合并")
print("=" * 60)
# Per-employee sales summary, to be merged back onto the employee table.
df_employee_sales = df_sales.groupby('employee_id').agg({
    'total_amount': 'sum',
    'quantity': 'sum',
    'sale_id': 'count'
}).reset_index()
df_employee_sales.columns = ['employee_id', 'total_sales', 'total_quantity', 'sale_count']
print("\n员工销售汇总(前5行):")
print(df_employee_sales.head())
# left_on/right_on name the join key on each side.  Here the two names
# coincide, so on='employee_id' would be equivalent -- the split form is
# shown for when the key columns are named differently.
df_merged = pd.merge(
    df_employees,
    df_employee_sales,
    left_on='employee_id',
    right_on='employee_id',
    how='left'
)
print("\n合并员工信息和销售数据(前5行):")
print(df_merged[['name', 'department', 'total_sales', 'sale_count']].head())
3. 多键合并
print("=" * 60)
print("3. 多键合并")
print("=" * 60)
# Sales summarised per (department, sale_date).
df_dept_sales = df_sales.merge(
    df_employees[['employee_id', 'department']],
    on='employee_id',
    how='left'
).groupby(['department', 'sale_date']).agg({
    'total_amount': 'sum',
    'quantity': 'sum'
}).reset_index()
print("\n按部门和日期的销售汇总(前5行):")
print(df_dept_sales.head())
# Merge on a list of keys; `suffixes` disambiguates overlapping columns.
# NOTE(review): despite the section title, the list holds only
# 'department', so each employee row fans out across every sale_date of
# that department -- confirm this one-to-many blow-up is intended.
df_multi_key = pd.merge(
    df_employees,
    df_dept_sales,
    on=['department'],
    how='left',
    suffixes=('_emp', '_sales')
)
print("\n多键合并结果(前5行):")
print(df_multi_key[['name', 'department', 'total_amount']].head())
4. Join 操作
print("=" * 60)
print("4. Join 操作")
print("=" * 60)
# join() aligns on the index, so make employee_id the index first.
df_emp_indexed = df_employees.set_index('employee_id')
df_sales_indexed = df_sales.set_index('employee_id')
# Left join on the shared index.
df_joined = df_emp_indexed.join(
    df_sales_indexed[['total_amount', 'sale_date']],
    how='left'
)
print("\n使用 join 合并(前5行):")
print(df_joined[['name', 'department', 'total_amount']].head())
# join() also accepts a list of frames for a multi-table join.
df_sales_summary = df_sales.groupby('employee_id')['total_amount'].sum().to_frame('total_sales')
df_sales_count = df_sales.groupby('employee_id')['sale_id'].count().to_frame('sale_count')
df_multi_join = df_emp_indexed.join([df_sales_summary, df_sales_count], how='left')
print("\n多表连接(前5行):")
print(df_multi_join[['name', 'department', 'total_sales', 'sale_count']].head())
5. Concatenate 操作
print("=" * 60)
print("5. Concatenate 操作")
print("=" * 60)
# Two quarter slices with identical structure.
df_q1 = df_sales[df_sales['sale_date'] <= '2024-03-31'].copy()
df_q1['quarter'] = 'Q1'
df_q2 = df_sales[(df_sales['sale_date'] > '2024-03-31') &
                 (df_sales['sale_date'] <= '2024-06-30')].copy()
df_q2['quarter'] = 'Q2'
# Vertical concatenation (stacking rows); ignore_index renumbers the result.
df_quarters = pd.concat([df_q1, df_q2], ignore_index=True)
print("\n垂直拼接结果:")
print(f"Q1 记录数: {len(df_q1)}")
print(f"Q2 记录数: {len(df_q2)}")
print(f"合并后记录数: {len(df_quarters)}")
print(f"季度分布:\n{df_quarters['quarter'].value_counts()}")
# Horizontal concatenation (axis=1) aligns on the index.
df_horizontal = pd.concat([df_employees[['employee_id', 'name', 'department']].head(5),
                           df_employees[['age', 'salary', 'bonus']].head(5)],
                          axis=1)
print("\n水平拼接结果(前5行):")
print(df_horizontal)
6. 合并实战案例
# Section 6: combine merge + fillna into a reporting helper.
print("=" * 60)
print("6. 合并实战案例")
print("=" * 60)
def create_employee_report(df_emp, df_sale, df_dept):
    """Build a per-employee report joining HR, sales and department data.

    Returns ``df_emp`` enriched with per-employee sales statistics, the
    matching department attributes, and a ``total_compensation`` column.
    Employees with no sales get zeros rather than NaN.
    """
    # Per-employee sales statistics (sum/mean/count of amounts, quantities).
    per_emp = df_sale.groupby('employee_id').agg({
        'total_amount': ['sum', 'mean', 'count'],
        'quantity': 'sum'
    }).reset_index()
    per_emp.columns = ['employee_id', 'total_sales', 'avg_sale', 'sale_count', 'total_quantity']
    # Attach sales stats, then department attributes.  Both joins are left
    # joins so every employee survives.
    report = df_emp.merge(per_emp, on='employee_id', how='left')
    report = report.merge(df_dept, on='department', how='left')
    # Employees without any sale: 0 instead of NaN.
    for col in ('total_sales', 'sale_count'):
        report[col] = report[col].fillna(0)
    # Salary plus bonus, where a missing bonus counts as 0.
    report['total_compensation'] = report['salary'] + report['bonus'].fillna(0)
    return report
# Generate the report and preview the key columns.
df_report = create_employee_report(df_employees, df_sales, df_dept_info)
print("\n员工综合报告(前5行):")
print(df_report[['name', 'department', 'salary', 'total_sales', 'sale_count', 'total_compensation']].head())
交叉表(Crosstab)
交叉表用于计算两个或多个分类变量的频率分布。
1. 基本交叉表
print("=" * 60)
print("1. 基本交叉表")
print("=" * 60)
# Frequency table of department vs. position.
crosstab_dept_pos = pd.crosstab(df_employees['department'], df_employees['position'])
print("\n部门和职位交叉表:")
print(crosstab_dept_pos)
# margins=True appends row/column totals under margins_name.
crosstab_with_margins = pd.crosstab(
    df_employees['department'],
    df_employees['position'],
    margins=True,
    margins_name='总计'
)
print("\n带总计的交叉表:")
print(crosstab_with_margins)
2. 多变量交叉表
print("=" * 60)
print("2. 多变量交叉表")
print("=" * 60)
# Rows are (department, position) pairs; columns are 10-year age bands.
crosstab_multi = pd.crosstab(
    [df_employees['department'], df_employees['position']],
    df_employees['age'] // 10 * 10,  # bucket ages into decades
    margins=True
)
print("\n多变量交叉表(部门和职位 vs 年龄组):")
print(crosstab_multi.head(10))
3. 使用聚合函数的交叉表
print("=" * 60)
print("3. 使用聚合函数的交叉表")
print("=" * 60)
# values + aggfunc turn a frequency crosstab into an aggregation table.
crosstab_avg_salary = pd.crosstab(
    df_employees['department'],
    df_employees['position'],
    values=df_employees['salary'],
    aggfunc='mean'
)
print("\n各部门各职位的平均薪资:")
print(crosstab_avg_salary.round(2))
# Total bonus per cell, counting missing bonuses as 0.
crosstab_sum_bonus = pd.crosstab(
    df_employees['department'],
    df_employees['position'],
    values=df_employees['bonus'].fillna(0),
    aggfunc='sum'
)
print("\n各部门各职位的总奖金:")
print(crosstab_sum_bonus)
4. 归一化交叉表
print("=" * 60)
print("4. 归一化交叉表")
print("=" * 60)
# normalize='index': each row sums to 100%.
crosstab_normalize_rows = pd.crosstab(
    df_employees['department'],
    df_employees['position'],
    normalize='index'
) * 100
print("\n按行归一化的交叉表(百分比):")
print(crosstab_normalize_rows.round(2))
# normalize='columns': each column sums to 100%.
crosstab_normalize_cols = pd.crosstab(
    df_employees['department'],
    df_employees['position'],
    normalize='columns'
) * 100
print("\n按列归一化的交叉表(百分比):")
print(crosstab_normalize_cols.round(2))
# normalize='all': the whole table sums to 100%.
crosstab_normalize_all = pd.crosstab(
    df_employees['department'],
    df_employees['position'],
    normalize='all'
) * 100
print("\n全部归一化的交叉表(百分比):")
print(crosstab_normalize_all.round(2))
5. 交叉表实战案例
print("=" * 60)
print("5. 交叉表实战案例")
print("=" * 60)
# Attach each sale's department before cross-tabulating.
df_sales_with_dept = df_sales.merge(
    df_employees[['employee_id', 'department']],
    on='employee_id',
    how='left'
)
# Total sales amount per region x product, with grand totals.
crosstab_region_product = pd.crosstab(
    df_sales_with_dept['region'],
    df_sales_with_dept['product'],
    values=df_sales_with_dept['total_amount'],
    aggfunc='sum',
    margins=True
)
print("\n各地区各产品的销售总额:")
print(crosstab_region_product)
# Mean sales amount per department x product.
crosstab_dept_product = pd.crosstab(
    df_sales_with_dept['department'],
    df_sales_with_dept['product'],
    values=df_sales_with_dept['total_amount'],
    aggfunc='mean'
)
print("\n各部门各产品的平均销售额:")
print(crosstab_dept_product.round(2))
透视表(Pivot Table)
透视表是数据分析中最强大的工具之一,可以快速进行多维度分析。
1. 基本透视表
print("=" * 60)
print("1. 基本透视表")
print("=" * 60)
# Mean salary pivot: departments as rows, positions as columns.
pivot_salary = pd.pivot_table(
    df_employees,
    values='salary',
    index='department',
    columns='position',
    aggfunc='mean'
)
print("\n各部门各职位的平均薪资透视表:")
print(pivot_salary.round(2))
# Several value columns at once.
pivot_multi = pd.pivot_table(
    df_employees,
    values=['salary', 'age', 'bonus'],
    index='department',
    columns='position',
    aggfunc='mean'
)
print("\n多个值列的透视表:")
print(pivot_multi.round(2))
2. 多级索引透视表
print("=" * 60)
print("2. 多级索引透视表")
print("=" * 60)
# Multi-level row index: (department, position).
pivot_multi_index = pd.pivot_table(
    df_employees,
    values='salary',
    index=['department', 'position'],
    aggfunc=['mean', 'count', 'std']
)
print("\n多级索引透视表:")
print(pivot_multi_index.head(10))
# Multi-level column index produced by several aggfuncs.
pivot_multi_cols = pd.pivot_table(
    df_employees,
    values='salary',
    index='department',
    columns=['position'],
    aggfunc=['mean', 'max', 'min']
)
print("\n多级列索引透视表:")
print(pivot_multi_cols)
3. 多个聚合函数
print("=" * 60)
print("3. 多个聚合函数")
print("=" * 60)
# Several aggregations of the same value column.
pivot_multi_agg = pd.pivot_table(
    df_employees,
    values='salary',
    index='department',
    columns='position',
    aggfunc=['mean', 'median', 'std', 'count']
)
print("\n多个聚合函数的透视表:")
print(pivot_multi_agg)
# Different aggregations per value column via a dict.
pivot_diff_agg = pd.pivot_table(
    df_employees,
    values=['salary', 'age', 'bonus'],
    index='department',
    aggfunc={
        'salary': ['mean', 'max'],
        'age': 'mean',
        'bonus': 'sum'
    }
)
print("\n不同列不同聚合函数的透视表:")
print(pivot_diff_agg)
4. 处理缺失值
print("=" * 60)
print("4. 处理缺失值")
print("=" * 60)
# fill_value replaces NaN cells in the pivot output.
pivot_fillna = pd.pivot_table(
    df_employees,
    values='bonus',
    index='department',
    columns='position',
    aggfunc='mean',
    fill_value=0  # empty cells become 0
)
print("\n填充缺失值的透视表:")
print(pivot_fillna.round(2))
# dropna=False KEEPS rows/columns whose entries are all NaN (by default
# pivot_table drops them).
pivot_dropna = pd.pivot_table(
    df_employees,
    values='bonus',
    index='department',
    columns='position',
    aggfunc='mean',
    dropna=False  # keep all-NaN rows/columns
)
print("\n保留 NaN 的透视表:")
print(pivot_dropna.round(2))
5. 透视表实战案例
print("=" * 60)
print("5. 透视表实战案例")
print("=" * 60)
# Attach department/position to every sale first.
df_sales_analysis = df_sales.merge(
    df_employees[['employee_id', 'department', 'position']],
    on='employee_id',
    how='left'
)
# Sales pivot: (department, region) rows vs. product columns, with
# sum/mean/count and grand totals.
pivot_sales = pd.pivot_table(
    df_sales_analysis,
    values='total_amount',
    index=['department', 'region'],
    columns='product',
    aggfunc=['sum', 'mean', 'count'],
    fill_value=0,
    margins=True,
    margins_name='总计'
)
print("\n销售分析透视表(部门和地区 vs 产品):")
print(pivot_sales.head(15))
# Monthly sales pivot via a Period('M') column.
df_sales_analysis['month'] = df_sales_analysis['sale_date'].dt.to_period('M')
pivot_monthly = pd.pivot_table(
    df_sales_analysis,
    values='total_amount',
    index='month',
    columns='product',
    aggfunc='sum',
    fill_value=0
)
print("\n月度销售透视表:")
print(pivot_monthly)
6. 高级透视表技巧
print("=" * 60)
print("6. 高级透视表技巧")
print("=" * 60)
# Custom callables can be mixed with built-in aggfunc names.
def q25(x):
    """Return the lower quartile (25th percentile) of *x*."""
    lower_quartile = x.quantile(0.25)
    return lower_quartile
def q75(x):
    """Return the upper quartile (75th percentile) of *x*."""
    upper_quartile = x.quantile(0.75)
    return upper_quartile
# Quartile pivot; column labels come from the callables' __name__.
pivot_quantiles = pd.pivot_table(
    df_employees,
    values='salary',
    index='department',
    columns='position',
    aggfunc=[q25, 'median', q75]
)
print("\n使用分位数的透视表:")
print(pivot_quantiles.round(2))
# reset_index() flattens the pivot back into an ordinary DataFrame.
pivot_reset = pd.pivot_table(
    df_employees,
    values='salary',
    index='department',
    columns='position',
    aggfunc='mean'
).reset_index()
print("\n重置索引后的透视表:")
print(pivot_reset.head())
综合实战案例
以下是一个综合运用所有技术的实战案例:
# Capstone example combining cleaning, merging, grouping and pivoting.
print("=" * 60)
print("综合实战案例:员工绩效分析系统")
print("=" * 60)
def comprehensive_analysis():
    """End-to-end demo: clean df_employees, merge in df_sales, then run
    group, crosstab, pivot and ranking analyses.

    Relies on the module-level df_employees / df_sales frames and returns
    the enriched per-employee analysis DataFrame.
    """
    # 1. Data cleaning.
    print("\n1. 数据清理")
    df_clean = df_employees.copy()
    # Fill bonus with each department's median (0 when the whole group is
    # missing); fill performance_score with the overall median.
    df_clean['bonus'] = df_clean.groupby('department')['bonus'].transform(
        lambda x: x.fillna(x.median() if not x.isna().all() else 0)
    )
    df_clean['performance_score'] = df_clean['performance_score'].fillna(
        df_clean['performance_score'].median()
    )
    print("缺失值处理完成")
    # 2. Merge in per-employee sales statistics.
    print("\n2. 合并销售数据")
    df_sales_summary = df_sales.groupby('employee_id').agg({
        'total_amount': 'sum',
        'quantity': 'sum',
        'sale_id': 'count'
    }).reset_index()
    df_sales_summary.columns = ['employee_id', 'total_sales', 'total_quantity', 'sale_count']
    df_analysis = pd.merge(df_clean, df_sales_summary, on='employee_id', how='left')
    df_analysis['total_sales'] = df_analysis['total_sales'].fillna(0)
    df_analysis['sale_count'] = df_analysis['sale_count'].fillna(0)
    print(f"合并完成,共 {len(df_analysis)} 条记录")
    # 3. Per-department aggregation.
    print("\n3. 分组分析")
    dept_analysis = df_analysis.groupby('department').agg({
        'salary': ['mean', 'median', 'std'],
        'age': 'mean',
        'performance_score': 'mean',
        'total_sales': 'sum',
        'employee_id': 'count'
    })
    dept_analysis.columns = ['平均薪资', '薪资中位数', '薪资标准差', '平均年龄',
                             '平均绩效', '总销售额', '员工数']
    print("\n各部门分析:")
    print(dept_analysis.round(2))
    # 4. Crosstab: mean performance per department x position.
    print("\n4. 交叉表分析")
    crosstab_analysis = pd.crosstab(
        df_analysis['department'],
        df_analysis['position'],
        values=df_analysis['performance_score'],
        aggfunc='mean',
        margins=True
    )
    print("\n各部门各职位平均绩效:")
    print(crosstab_analysis.round(2))
    # 5. Pivot: several metrics with per-column aggregation functions.
    print("\n5. 透视表分析")
    pivot_analysis = pd.pivot_table(
        df_analysis,
        values=['salary', 'total_sales', 'performance_score'],
        index='department',
        columns='position',
        aggfunc={
            'salary': 'mean',
            'total_sales': 'sum',
            'performance_score': 'mean'
        },
        fill_value=0
    )
    print("\n综合透视表:")
    print(pivot_analysis.round(2))
    # 6. Rankings: dense rank of performance within each department, plus
    # salary quartile labels per department.
    # NOTE(review): pd.qcut with duplicates='drop' can yield fewer than 4
    # bins, and then the 4 fixed labels raise a ValueError -- confirm the
    # salary distributions always produce distinct quartile edges.
    print("\n6. 绩效排名分析")
    df_analysis['dept_rank'] = df_analysis.groupby('department')['performance_score'].rank(
        ascending=False, method='dense'
    )
    df_analysis['salary_percentile'] = df_analysis.groupby('department')['salary'].transform(
        lambda x: pd.qcut(x, q=4, labels=['低', '中低', '中高', '高'], duplicates='drop')
    )
    print("\n各部门绩效排名前3:")
    for dept in df_analysis['department'].unique():
        top3 = df_analysis[df_analysis['department'] == dept].nlargest(3, 'performance_score')
        print(f"\n{dept}:")
        print(top3[['name', 'performance_score', 'salary', 'total_sales']])
    return df_analysis
# Run the full analysis.
df_final = comprehensive_analysis()
print("\n" + "=" * 60)
print("分析完成!")
print("=" * 60)
性能优化技巧
1. 使用分类数据类型
print("=" * 60)
print("性能优化技巧")
print("=" * 60)
# Categorical dtype stores repeated strings as integer codes, cutting memory.
df_optimized = df_employees.copy()
df_optimized['department'] = df_optimized['department'].astype('category')
df_optimized['position'] = df_optimized['position'].astype('category')
print("\n内存使用对比:")
print(f"原始数据内存: {df_employees.memory_usage(deep=True).sum() / 1024:.2f} KB")
print(f"优化后内存: {df_optimized.memory_usage(deep=True).sum() / 1024:.2f} KB")
2. 使用查询优化
# query() expression form (often terser, sometimes faster)...
result_query = df_employees.query('age > 30 and salary > 15000')
# ...versus the equivalent boolean-mask indexing.
result_bool = df_employees[(df_employees['age'] > 30) & (df_employees['salary'] > 15000)]
print(f"\n查询结果数量: {len(result_query)}")
3. 避免链式赋值
# Bad: chained assignment -- df[mask]['col'] = ... may write to a copy and
# triggers SettingWithCopyWarning.
# df_employees[df_employees['age'] > 50]['salary'] = 20000
# Good: a single .loc assignment writes to the original frame.
df_employees.loc[df_employees['age'] > 50, 'salary'] = 20000
4. 使用向量化操作
# Vectorised column arithmetic (fast, runs in C).
df_employees['total_comp'] = df_employees['salary'] + df_employees['bonus'].fillna(0)
# Equivalent row-by-row loop (slow) -- kept commented for contrast only:
# for idx, row in df_employees.iterrows():
#     df_employees.loc[idx, 'total_comp'] = row['salary'] + (row['bonus'] or 0)
总结
本文详细介绍了 Pandas 的进阶功能:
- 缺失值处理:检测、删除、填充缺失值的多种方法
- 数据分组:GroupBy 的强大功能和灵活应用
- 数据合并:Merge、Join、Concat 的使用场景
- 交叉表:快速生成频率分布表
- 透视表:多维度数据分析的强大工具
关键要点
- 缺失值处理:根据业务需求选择合适的填充策略
- 分组操作:灵活使用 transform、filter、apply 等函数
- 数据合并:理解不同连接方式的区别和适用场景
- 交叉表和透视表:快速进行多维度数据分析
实践建议
- 在实际项目中多练习这些操作
- 注意数据量较大时的性能优化
- 理解每个操作的底层原理
- 结合业务场景灵活运用