常用代码片段

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
mpl.rcParams["axes.unicode_minus"] = False

保存 df 的 plot

plot = df.plot(y='人数', xlim=(0,150), figsize=(10,7), ylabel='人数')
figure = plot.get_figure()
figure.savefig("/tmp/hello.png")

matplotlib 与日期列

df['dt'] = pd.to_datetime(df['dt'])
df.set_index(['dt'],inplace=True)
df.plot()

设置 matplotlib 为 seaborn 风格

import seaborn as sns; sns.set()

计算移动平均 (滚动均值)

df['new_col'] = df['column'].rolling(5).mean()

相关性

df.loc[df.na == '沪深300', ['pe', 'pb']].corr(method='pearson')

df.loc[df.na == '沪深300', ['pe', 'pb']].corr(method='spearman')

df.loc[df.na == '沪深300', ['pe', 'pb']].corr(method='kendall')

百分比变化

当前元素与前一元素之间的变化率 (返回的是小数, 例如 0.05 表示 5%). 默认按行计算, 即在每一列内与上一行比较.

改为列: df.pct_change(axis='columns')

df.loc[df.na == '沪深300', ['pe']].head().pct_change()

根据某列的值来排序所有行

df = df.sort_values(by='DateTime1',ascending=True)
df

转换某列的数据类型

## 将不能转换数据类型的值强制转换成NaN
df['ch'] = pd.to_numeric(df['ch'],errors='coerce')

删除非法值

df = df.dropna()

转换索引类型

df.index = pd.to_datetime(df.index)

看作 NumPy 数组 (单列得到的是一维数组; 对整个 DataFrame 使用 df.values 才是二维数组)

df['col'].values

多个 DF Merge

from functools import reduce
import pandas as pd

dfs = [df0, df1, df2, dfN]
df_final = reduce(lambda left,right: pd.merge(left,right,on='name'), dfs)

Pandas plot string histogram

df['loc1'].value_counts().plot.bar()

调整 plot 画出的大小

df['loc2'].value_counts().plot.bar(figsize=(35,20))

分组后取 TOP

df['brand'].value_counts().nlargest(10).plot.bar(figsize=(10, 5))

按时间列的小时分组

df.groupby(df['bid_time'].dt.hour).agg({'click_cnt':'sum'})

.dt 是日期时间列的访问器, 用法类似字符串列的 .str 访问器

Group By 后再 subplots

group_df['crate'].unstack().plot.line(subplots=True, figsize=(20, 14))

需要先调用 unstack() 把分组索引展开成列, subplots 参数才会生效 (每一列画一个子图)

资料