Skip to content

数据读取

# Load the training data. The file is tab-separated and is read with
# header=None, so the column names are supplied explicitly.
file_name = "../track2/final_track2_train.txt"
column_names = [
    "uid",
    "user_city",
    "item_id",
    "author_id",
    "item_city",
    "channel",
    "finish",
    "like",
    "music_id",
    "device",
    "time",
    "duration_time",
]
df = pd.read_csv(file_name, sep="\t", header=None, names=column_names)

行处理和列处理

apply函数 (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html): axis=0 (默认) 时, func的输入为DataFrame的一列, 类型为Series, index为DataFrame的行索引; axis=1时, func的输入为DataFrame的一行, 类型为Series, index为DataFrame的列索引。

# Small demo frame: three identical rows [4, 9] under columns A and B.
df = pd.DataFrame([[4, 9] for _ in range(3)], columns=["A", "B"])
# With the default axis=0 each column is passed in as a Series whose
# .name is the column label, so this returns the column names.
df.apply(lambda col: col.name)

# Count how many distinct values each column holds.
def uniq_count(series):
    """Return the number of distinct values in ``series``."""
    distinct = np.unique(series)
    return distinct.size
# Run uniq_count on every column (axis=0 hands the function one column at a time).
df.apply(uniq_count, axis=0)
# Minimum and maximum of each column.
def minMax(x):
    """Return a Series labelled 'min' and 'max' with the extremes of ``x``."""
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
# Run minMax on every column (apply defaults to axis=0).
df.apply(minMax)
# Apply the same function to several columns at once; cols is a list of
# column labels. Here values below 0 are raised to 0.
df[cols] = df[cols].apply(lambda col: col.clip(lower=0))

分组汇总

# Number of rows in each uid group; reset_index turns the sizes into a
# regular column named 'counts'.
(
    df.groupby(["uid"])
    .size()
    .reset_index(name="counts")
)

DataFrame.apply 和 GroupBy.apply 不同: DataFrame.apply 传递给自定义函数的是Series对象, 且apply有自己的参数 (如axis); GroupBy.apply 传递给自定义函数的是每个分组对应的DataFrame对象, 并且apply会将收到的其余参数也传递给自定义函数。

def item_count(df):
    """Return the number of distinct ``item_id`` values in ``df``."""
    distinct_items = np.unique(df["item_id"])
    return distinct_items.shape[0]
# Distinct item_id count per uid: GroupBy.apply passes each uid's
# sub-frame to item_count.
user_item_count = (
    df[["uid", "item_id"]]
    .groupby(["uid"])
    .apply(item_count)
)
# When the function passed to GroupBy.apply returns a Series, the index of that Series becomes the column labels of the result.

eg1: 
def f(x):
    """Summarise one group as a Series of four named aggregates.

    ``x`` must provide columns 'a', 'b', 'c' and 'd'. The result holds the
    sum and max of 'a', the mean of 'b', and the sum of the element-wise
    product of 'c' and 'd', in that order.
    """
    col_a = x['a']
    stats = {
        'a_sum': col_a.sum(),
        'a_max': col_a.max(),
        'b_mean': x['b'].mean(),
        'c_d_prodsum': (x['c'] * x['d']).sum(),
    }
    # The explicit index pins the output order of the labels.
    return pd.Series(stats, index=list(stats))

# Call f once per 'group' value; the labels of each returned Series become
# the columns of the result (see the sample output below).
df.groupby('group').apply(f)


          a_sum     a_max    b_mean  c_d_prodsum
group                                           
0      0.560541  0.507058  0.418546     0.118106
1      0.187757  0.157958  0.887315     0.276808


eg2:
# Row count for each distinct value of `like` over the whole data set,
# indexed by the like value. extract_feature reads entries 0 and 1.
like_info = (
    data[["like"]]
    .groupby(["like"])
    .size()
)

def extract_feature(df, like_counts=None):
    """Compute raw and smoothed like-rate features for one group of rows.

    Args:
        df: Rows of a single group; must contain a ``like`` column. The
            column is summed, so it is presumably a 0/1 flag (assumption,
            confirm against the data).
        like_counts: Object indexable by ``0`` and ``1`` giving the prior
            counts of non-liked and liked rows. Defaults to the
            module-level ``like_info``.

    Returns:
        pd.Series with, in order:
        ``user_city_pv`` (number of rows), ``user_city_clk`` (sum of
        ``like``), ``user_city_pvbeta`` (pv plus both prior counts),
        ``user_city_clkbeta`` (clk plus the prior liked count) and
        ``user_city_ctrbeta`` (clkbeta / pvbeta).
    """
    if like_counts is None:
        # Fall back to the global prior so GroupBy.apply(extract_feature)
        # keeps working without extra arguments.
        like_counts = like_info

    likes = df['like']
    pv = len(likes)
    # Vectorised sum; note that it skips NaN where the builtin sum would not.
    clk = likes.sum()

    pv_beta = pv + like_counts[0] + like_counts[1]
    # Fix: the smoothed like count must start from the likes (clk). The
    # previous code added the prior to pv, so the ratio did not depend on
    # the group's likes at all.
    clk_beta = clk + like_counts[1]

    features = {
        'user_city_pv': pv,
        'user_city_clk': clk,
        'user_city_pvbeta': pv_beta,
        'user_city_clkbeta': clk_beta,
        'user_city_ctrbeta': clk_beta / pv_beta,
    }
    return pd.Series(features, index=['user_city_pv', 'user_city_clk',
                                      'user_city_pvbeta', 'user_city_clkbeta',
                                      'user_city_ctrbeta'])

# Build the like-rate features per user_city: each city's rows are passed
# to extract_feature and the returned Series labels become the columns.
(
    data[["user_city", "like"]]
    .groupby(["user_city"])
    .apply(extract_feature)
)

数据拼接

# Left-join item_city_data onto data using the item_city column; every
# row of data is kept.
data = data.merge(item_city_data, how="left", on=["item_city"])

# Row-wise concatenation: stack the train and test frames vertically.
data = pd.concat([data_train, data_test], axis=0)

shift