Data loading
import pandas as pd
import numpy as np

file_name = "../track2/final_track2_train.txt"
column_names = ["uid", "user_city",
                "item_id", "author_id", "item_city", "channel",
                "finish", "like", "music_id", "device", "time",
                "duration_time"]
df = pd.read_csv(file_name, sep='\t', header=None, names=column_names)
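Before loading the full file it can help to read a small sample and check that the separator and column names line up (a minimal sketch; nrows=1000 is an arbitrary sample size):
# Load only the first rows for a quick sanity check
df_sample = pd.read_csv(file_name, sep='\t', header=None,
                        names=column_names, nrows=1000)
df_sample.info()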
Row and column processing
The apply function (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html): with axis=0 (the default), func receives one column of the DataFrame at a time, as a Series whose index is the DataFrame's row index; with axis=1, func receives one row at a time, as a Series whose index is the DataFrame's column index.
df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
df.apply(lambda x: x.name)  # return each column's name
# Count how many distinct values each column has
def uniq_count(series):
    return len(np.unique(series))

df.apply(uniq_count, axis=0)
# Compute the minimum and maximum of each column
def minMax(x):
    return pd.Series(index=['min', 'max'], data=[x.min(), x.max()])

df.apply(minMax)
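The same min/max table can be produced without a custom function via agg (a one-line sketch):
df.agg(['min', 'max'])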
# Apply the same function to several columns at once; cols is a list of column names
df[cols] = df[cols].apply(lambda x: x.clip(lower=0))
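The explanation above also covers axis=1; here is a minimal row-wise sketch on the toy df from earlier (the 'key' column name is made up for illustration):
# axis=1: x is one row, indexed by the column names 'A' and 'B'
df['key'] = df.apply(lambda x: f"{x['A']}_{x['B']}", axis=1)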
Grouping and aggregation
# How many rows are in each group
df.groupby(["uid"]).size().reset_index(name='counts')
DataFrame.apply and GroupBy.apply are not the same:
- DataFrame.apply passes a Series object to the custom function, and keeps its own parameters (axis, result_type, ...) for itself
- GroupBy.apply passes a DataFrame object (one group) to the custom function, and forwards any extra arguments on to the custom function
def item_count(df):
    return len(np.unique(df["item_id"]))

user_item_count = df[["uid", "item_id"]].groupby(["uid"]).apply(item_count)
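For this particular statistic, pandas has a built-in that avoids the custom function entirely (an equivalent sketch):
# Number of distinct item_id values per uid, without apply
user_item_count = df.groupby("uid")["item_id"].nunique()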
# apply a function that returns a Series: the Series index becomes the labels of the new columns
eg1:
def f(x):
    d = {}
    d['a_sum'] = x['a'].sum()
    d['a_max'] = x['a'].max()
    d['b_mean'] = x['b'].mean()
    d['c_d_prodsum'] = (x['c'] * x['d']).sum()
    return pd.Series(d, index=['a_sum', 'a_max', 'b_mean', 'c_d_prodsum'])

df.groupby('group').apply(f)
a_sum a_max b_mean c_d_prodsum
group
0 0.560541 0.507058 0.418546 0.118106
1 0.187757 0.157958 0.887315 0.276808
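The simple aggregates in eg1 can also be written with named aggregation (agg with new_name=(column, function), available since pandas 0.25); only the cross-column product still needs apply or a precomputed helper column. A sketch on the same hypothetical df with columns group, a, b, c, d:
df['c_d'] = df['c'] * df['d']
out = df.groupby('group').agg(a_sum=('a', 'sum'),
                              a_max=('a', 'max'),
                              b_mean=('b', 'mean'),
                              c_d_prodsum=('c_d', 'sum'))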
eg2:
# Global like distribution: like_info[0] = rows with like == 0,
# like_info[1] = rows with like == 1
like_info = data[['like']].groupby(['like']).size()

def extract_feature(df):
    d = {}
    d['user_city_pv'] = len(df['like'])
    d['user_city_clk'] = sum(df['like'])
    d['user_city_pvbeta'] = d['user_city_pv'] + like_info[0] + like_info[1]
    d['user_city_clkbeta'] = d['user_city_clk'] + like_info[1]
    d['user_city_ctrbeta'] = d['user_city_clkbeta'] / d['user_city_pvbeta']
    return pd.Series(d, index=['user_city_pv', 'user_city_clk',
                               'user_city_pvbeta', 'user_city_clkbeta',
                               'user_city_ctrbeta'])

data[['user_city', 'like']].groupby(['user_city']).apply(extract_feature)
Merging and concatenation
# Left join the per-item_city features onto the main table
data = data.merge(item_city_data, on=['item_city'], how='left')
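For example, the user_city features computed with extract_feature above can be attached back onto every row; the variable name user_city_feature below is made up for illustration:
# GroupBy.apply returns a DataFrame indexed by user_city;
# reset_index() turns that index into a column so it can be merged back
user_city_feature = (data[['user_city', 'like']]
                     .groupby(['user_city'])
                     .apply(extract_feature)
                     .reset_index())
data = data.merge(user_city_feature, on=['user_city'], how='left')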
# Concatenate along rows (stack train and test vertically)
data = pd.concat([data_train, data_test], axis=0)
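When train and test come from separate files their row indexes overlap after stacking; ignore_index=True rebuilds a clean 0..n-1 index, and keeping the original length lets you split the two parts apart again later (a sketch; train_part/test_part are illustrative names):
n_train = len(data_train)
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)
train_part, test_part = data.iloc[:n_train], data.iloc[n_train:]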