Pandas杂项
Contents
常用代码片段
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
mpl.rcParams["axes.unicode_minus"] = False
保存 df 的 plot
plot = df.plot(y='人数', xlim=(0,150), figsize=(10,7), ylabel='人数')
figure = plot.get_figure()
figure.savefig("/tmp/hello.png")
matplotlib 与日期列
df['dt'] = pd.to_datetime(df['dt'])
df.set_index(['dt'],inplace=True)
df.plot()
设置 matplotlib 为 seaborn 风格
import seaborn as sns; sns.set()
计算移动平均 (滑动窗口均值)
df['new_col'] = df['column'].rolling(5).mean()
相关性
df.loc[df.na == '沪深300', ['pe', 'pb']].corr(method='pearson')
df.loc[df.na == '沪深300', ['pe', 'pb']].corr(method='spearman')
df.loc[df.na == '沪深300', ['pe', 'pb']].corr(method='kendall')
百分比变化
计算当前元素与前一元素之间的百分比变化. 默认沿行方向 (axis=0), 即每一行与上一行比较.
改为沿列方向 (每一列与前一列比较):
df.pct_change(axis='columns')
df.loc[df.na == '沪深300', ['pe']].head().pct_change()
根据某列的值来排序所有行
df = df.sort_values(by='DateTime1',ascending=True)
df
转换某列的数据类型
## 将不能转换数据类型的值强制转换成NaN
df['ch'] = pd.to_numeric(df['ch'],errors='coerce')
删除含有缺失值 (NaN) 的行
df = df.dropna()
转换索引类型
df.index = pd.to_datetime(df.index)
转为 NumPy 数组 (单列得到的是一维数组)
df['col'].values
多个 DF Merge
from functools import reduce
import pandas as pd
dfs = [df0, df1, df2, dfN]
df_final = reduce(lambda left,right: pd.merge(left,right,on='name'), dfs)
Pandas plot string histogram
df['loc1'].value_counts().plot.bar()
调整 plot 画出的大小
df['loc2'].value_counts().plot.bar(figsize=(35,20))
分组后取 TOP
df['brand'].value_counts().nlargest(10).plot.bar(figsize=(10, 5))
按时间列的小时分组
df.groupby(df['bid_time'].dt.hour).agg({'click_cnt':'sum'})
.dt 访问器类似 .str, 用于访问日期时间列的属性 (如 hour)
Group By 后再 subplots
group_df['crate'].unstack().plot.line(subplots=True, figsize=(20, 14))
要先调用 unstack(), subplots 参数才会生效