失眠网 > Python量化交易学习笔记（47）——因子扩展

Python量化交易学习笔记（47）——因子扩展

时间：2020-02-04 05:27:46

用这篇文章记录一下目前学习过程中使用到的因子，包括使用纯指标规则以及使用机器学习方法所使用到的因子，以便于后续文章描述使用。这里以日线数据为例，周线、月线的因子扩展需要注意适度减小均线周期大小等参数。

规则选股因子扩展

全部因子扩展代码在本节末，这里进行几点说明：

- 只实现了部分因子的扩展，便于后面规则选股时使用。可以根据具体需要，增删相应因子。
- 在首次运行代码时，会进行扩展因子的全量计算；当后续日线数据更新后，再运行代码，只会计算更新日期的扩展因子，避免了全量计算带来的过长时间消耗，即实现了增量计算。
- MACD及均线指标的实现，参考了字王的topq_talib包。
- macd_ext指标实现的是：从当前时间点向前，第1、2、3块红柱、绿柱的面积，可用于辅助背离判断。
- 异动量及异动量收复参考了微博大V煎蛋的因子。
- 底分型的实现只用了近3根K线数据，未实现缠论中的合并规则。
- 使用shift操作，将前n日因子合并到当前日期上，便于后续选股使用。合并后的名称采用“因子_na”的形式来表示，例如close_3a表示3天前的收盘价。

import os.path  # path handling
import sys  # sys.argv[0] is used to locate this script
import time

import pandas as pd

# Project root: the parent of the directory that contains this script.
proj_path = os.path.dirname(os.path.abspath(sys.argv[0])) + '/../'

g_ma_list = [5, 10, 20, 30, 60, 120, 250]  # moving-average windows on close
g_vol_ma_list = [5, 10, 135]  # moving-average windows on volume
g_shift_n = 5  # how many previous days are merged onto each row
g_min_period = 150  # lower bound on the history kept for incremental updates


def _flag(mask):
    """Turn a boolean mask into a 0/1 integer Series.

    Comparisons against NaN are False, so rows with missing inputs get 0.
    """
    return mask.astype(int)


def MACD(df, n_fast, n_slow, ksgn='close'):
    """Join the columns ``macd``, ``mdea`` and ``mdiff`` to *df*.

    ``mdiff`` is fast EMA minus slow EMA of column *ksgn*, ``mdea`` is the
    EMA of ``mdiff`` over ``max(int((n_fast + n_slow) / 4), 2)`` periods and
    ``macd`` is ``2 * (mdiff - mdea)``.  Returns a new DataFrame.
    """
    price = df[ksgn]
    ema_fast = price.ewm(span=n_fast, min_periods=n_fast - 1).mean()
    ema_slow = price.ewm(span=n_slow, min_periods=n_slow - 1).mean()
    mdiff = (ema_fast - ema_slow).rename('mdiff')
    signal_span = max(int((n_fast + n_slow) / 4), 2)
    mdea = mdiff.ewm(span=signal_span,
                     min_periods=signal_span - 1).mean().rename('mdea')
    macd = ((mdiff - mdea) * 2).rename('macd')
    # Join order is kept (macd, mdea, mdiff) so the CSV column layout is stable.
    return df.join(macd).join(mdea).join(mdiff)


def MA_n(df, n, ksgn='close'):
    """Join the *n*-row simple moving average of column *ksgn* to *df*.

    The new column is ``ma_<n>`` for ``close`` and ``<ksgn>_ma_<n>`` for any
    other column.  Returns a new DataFrame.
    """
    prefix = '' if ksgn == 'close' else ksgn + '_'
    name = '{}ma_{}'.format(prefix, n)
    average = df[ksgn].rolling(window=n, center=False).mean().rename(name)
    return df.join(average)


def macd_ext(df, n):
    """Add the summed ``macd`` of the last *n* finished positive/negative runs.

    Adds ``macd_1a`` (previous row's macd), ``macd_switch`` (1 when macd turns
    from negative to positive, -1 for the opposite turn, else 0) and the
    columns ``red0, green0, ..., red<n-1>, green<n-1>``.  ``red<k>`` /
    ``green<k>`` hold the sum of macd over the k-th most recent run that ended
    with a downward / upward turn.  Requires a ``macd`` column.
    """
    macd = df['macd']
    previous = macd.shift(1)
    df['macd_1a'] = previous
    turned_up = (macd > 0) & (previous < 0)
    turned_down = (macd < 0) & (previous > 0)
    df['macd_switch'] = turned_up.astype(int) - turned_down.astype(int)

    rows = df.shape[0]
    # One independent list per slot (a shared list would alias every slot).
    red = [[0.0] * rows for _ in range(n)]
    green = [[0.0] * rows for _ in range(n)]
    curr_red = [0.0] * n
    curr_green = [0.0] * n
    accu_value = 0.0
    pairs = zip(macd.tolist(), df['macd_switch'].tolist())
    for i, (value, switch) in enumerate(pairs):
        if pd.isna(value):
            continue  # rows without a macd value keep 0.0 in every slot
        if switch == 1:
            # A negative run just ended: push its sum to the front of green.
            curr_green = ([accu_value] + curr_green)[:n]
            accu_value = value
        elif switch == -1:
            # A positive run just ended: push its sum to the front of red.
            curr_red = ([accu_value] + curr_red)[:n]
            accu_value = value
        else:
            accu_value += value
        for j in range(n):
            red[j][i] = curr_red[j]
            green[j][i] = curr_green[j]

    for j in range(n):
        df = df.join(pd.Series(red[j], name='red{}'.format(j), index=df.index))
        df = df.join(
            pd.Series(green[j], name='green{}'.format(j), index=df.index))
    return df


def shrink_negative_line(df):
    """Flag a shrinking-volume down candle that follows a >9% up day.

    Uses the shifted columns produced by ``shift_till_n`` (``close_1a``,
    ``close_2a``, ``volume_1a``, ``low_1a``).  The previous version read
    ``close_1`` / ``close_2`` / ``volume_1`` / ``low_1``, which are never
    created, so it could not run.
    """
    df['shrink_negative_line'] = _flag(
        ((df['close_1a'] - df['close_2a']) / df['close_2a'] > 0.09)
        & (df['volume'] < df['volume_1a'])
        & (df['close'] < df['open'])
        & (df['low'] > df['low_1a'])
        & (df['close'] < df['close_1a']))
    return df


def shrink_volume(df):
    """Flag rows whose volume is below the previous row's volume."""
    df['shrink_volume'] = _flag(df['volume'] < df['volume_1a'])
    return df


def volume_boom(df):
    """Flag rows whose volume exceeds the 135-row volume moving average."""
    df['volume_boom'] = _flag(df['volume'] > df['volume_ma_135'])
    return df


def value_boom(df):
    """Flag rows whose close rose more than 9% over the previous close."""
    df['value_boom'] = _flag(
        (df['close'] - df['close_1a']) / df['close_1a'] > 0.09)
    return df


def bottom_shape(df):
    """Flag a three-candle bottom: the middle candle has the lowest low and
    the lowest high of the last three rows (no candle merging is applied)."""
    df['bottom_shape'] = _flag(
        (df['low_1a'] < df['low_2a'])
        & (df['low_1a'] < df['low'])
        & (df['high_1a'] < df['high_2a'])
        & (df['high_1a'] < df['high']))
    return df


def retrieve_special_volume(df):
    """Flag an up candle closing above the previous high, one row after a
    ``special_volume`` row."""
    df['retrieve_special_volume'] = _flag(
        (df['special_volume_1a'] == 1)
        & (df['close'] > df['high_1a'])
        & (df['close'] > df['open']))
    return df


def positive(df):
    """Flag up candles (close above open)."""
    df['positive'] = _flag(df['close'] > df['open'])
    return df


def negative(df):
    """Flag down candles (close below open)."""
    df['negative'] = _flag(df['close'] < df['open'])
    return df


def special_volume(df):
    """Flag a down candle that closes below the previous close on higher
    volume than the previous row."""
    df['special_volume'] = _flag(
        (df['open'] > df['close'])
        & (df['close'] < df['close_1a'])
        & (df['volume'] > df['volume_1a']))
    return df


def shift_till_n(df, indicator_list, n):
    """Add the 1..n row lags of every column in *indicator_list* (in place)."""
    for i in range(n):
        shift_i(df, indicator_list, i + 1)
    return df


def shift_i(df, indicator_list, i):
    """Add ``<name>_<i>a`` = column shifted by *i* rows, for each name."""
    for ind in indicator_list:
        df['{}_{}a'.format(ind, i)] = df[ind].shift(i)
    return df


def history_rows():
    """Rows of prior history needed so newly appended rows are fully defined.

    The longest moving average must already be valid ``g_shift_n`` rows
    before the first new row; otherwise lagged columns such as ``ma_250_5a``
    come out as NaN for the new rows.
    """
    longest = max(g_min_period, max(g_vol_ma_list), max(g_ma_list))
    return longest + g_shift_n


def count_new_rows(df, exist_df):
    """Count the rows of *df* that come after the last date in *exist_df*.

    Returns ``None`` when the stored result cannot be aligned with *df*
    (it is empty, has no ``date`` column, or its last date is absent from
    *df*); the caller should then recompute everything.
    """
    if exist_df.empty or 'date' not in exist_df.columns:
        return None
    hits = (df['date'] == exist_df['date'].iloc[-1]).to_numpy().nonzero()[0]
    if len(hits) == 0:
        return None
    return df.shape[0] - int(hits[0]) - 1


def extend_factors(df):
    """Run the whole rule-based factor pipeline and return the new frame.

    Expects the columns open, high, low, close, volume and amount.  The call
    order is fixed because it determines the output column order.
    """
    for window in g_vol_ma_list:
        df = MA_n(df, window, 'volume')
    for window in g_ma_list:
        df = MA_n(df, window)

    indicator_list = ['open', 'high', 'low', 'close', 'volume', 'amount']
    indicator_list.extend('ma_{}'.format(w) for w in g_ma_list)
    indicator_list.extend('volume_ma_{}'.format(w) for w in g_vol_ma_list)
    df = shift_till_n(df, indicator_list, g_shift_n)

    df = special_volume(df)
    df = shift_till_n(df, ['special_volume'], g_shift_n)
    df = retrieve_special_volume(df)
    df = bottom_shape(df)

    df = MACD(df, 12, 26)
    df = macd_ext(df, 3)

    df = value_boom(df)
    df = shift_till_n(df, ['value_boom'], g_shift_n)
    df = volume_boom(df)
    df = shift_till_n(df, ['volume_boom'], g_shift_n)
    df = shrink_volume(df)
    df = shift_till_n(df, ['shrink_volume'], g_shift_n)
    # shrink_negative_line is deliberately not part of the output schema.

    df = positive(df)
    df = negative(df)
    df = shift_till_n(df, ['positive', 'negative'], g_shift_n)
    return df


def main():
    """Extend every stock's daily CSV with the rule-based factors.

    The first run computes all rows.  Later runs only compute rows newer than
    the last stored date and append them to the existing output.
    NOTE: on incremental runs the EWM-based MACD columns and the red/green
    sums are computed from the truncated window only, so they may differ
    slightly from a full-history run.
    """
    time_start = time.time()
    # Read codes as text so numeric-looking codes keep leading zeros and can
    # be concatenated into file names.
    stock_codes = pd.read_csv(proj_path + 'data/tdx/all_stock_codes.csv',
                              encoding='unicode_escape', dtype={'code': str})
    out_dir = proj_path + 'data/extension/d/hard_rules/'
    os.makedirs(out_dir, exist_ok=True)

    for code in stock_codes['code']:
        print('processing {}...'.format(code))
        input_file = proj_path + 'data/tdx/day/' + code + '.csv'
        if not os.path.exists(input_file):
            continue
        output_file = out_dir + code + '.csv'
        df = pd.read_csv(input_file).sort_index(ascending=True)

        exist_df = None
        new_lines = None  # None means "compute everything"
        if os.path.exists(output_file):
            exist_df = pd.read_csv(output_file)
            new_lines = count_new_rows(df, exist_df)
            if new_lines == 0:
                continue  # output is already up to date
        if new_lines:
            # Keep just enough history for the new rows to be fully defined.
            df = df.iloc[-(history_rows() + new_lines):].copy()

        df = extend_factors(df)

        if new_lines:
            df = pd.concat([exist_df, df.iloc[-new_lines:]],
                           ignore_index=True)
        df.to_csv(output_file, index=False)
        print(code + ' done!')

    time_end = time.time()
    print('程序所耗时间：', time_end - time_start)


if __name__ == '__main__':
    main()

机器学习选股因子扩展

全部扩展因子在本节末，几点说明：

- 这里的扩展因子拟应用于机器学习，将选股处理成二分类问题，因此需要计算标签信息。使用class_label方法来计算相应的标签值。
- 复用了上一节的因子，也使用pandas_ta实现了大量因子的计算。关键代码：

df.ta.strategy(exclude=['dpo', 'psar', 'supertrend', 'ichimoku', 'hilo'], verbose=True, timed=True)

其中，去除的因子在很多时候没有输出值，会影响机器学习的计算。处理后因子总维度为303。

- 未实现增量计算，即每次都对全量因子进行计算。主要原因是没有对pandas_ta进行深度研究，无法判断增量计算的结果是否与全量计算一致。
- 对因子和最后结果的相关性进行计算，发现与成交量相关的因子和最后的结果相关性最高。

import os.path  # path handling
import sys  # sys.argv[0] is used to locate this script
import time

import pandas as pd

# Project root: the parent of the directory that contains this script.
proj_path = os.path.dirname(os.path.abspath(sys.argv[0])) + '/../'

g_ma_list = [5, 10, 20, 30, 60, 120, 250]  # moving-average windows on close
g_vol_ma_list = [5, 10, 135]  # moving-average windows on volume
g_shift_n = 5  # how many previous days are merged onto each row
g_ml_min_period = 1500  # a stock needs more rows than this to be used

# (days, percent_change) pairs for which a binary label column is produced.
g_label_specs = [(1, 0.095), (2, 0.095), (5, 0.095), (10, 0.095),
                 (2, 0.195), (5, 0.195), (10, 0.195)]


def _flag(mask):
    """Turn a boolean mask into a 0/1 integer Series.

    Comparisons against NaN are False, so rows with missing inputs get 0.
    """
    return mask.astype(int)


def MACD(df, n_fast, n_slow, ksgn='close'):
    """Join the columns ``macd``, ``mdea`` and ``mdiff`` to *df*.

    ``mdiff`` is fast EMA minus slow EMA of column *ksgn*, ``mdea`` is the
    EMA of ``mdiff`` over ``max(int((n_fast + n_slow) / 4), 2)`` periods and
    ``macd`` is ``2 * (mdiff - mdea)``.  Returns a new DataFrame.
    """
    price = df[ksgn]
    ema_fast = price.ewm(span=n_fast, min_periods=n_fast - 1).mean()
    ema_slow = price.ewm(span=n_slow, min_periods=n_slow - 1).mean()
    mdiff = (ema_fast - ema_slow).rename('mdiff')
    signal_span = max(int((n_fast + n_slow) / 4), 2)
    mdea = mdiff.ewm(span=signal_span,
                     min_periods=signal_span - 1).mean().rename('mdea')
    macd = ((mdiff - mdea) * 2).rename('macd')
    # Join order is kept (macd, mdea, mdiff) so the CSV column layout is stable.
    return df.join(macd).join(mdea).join(mdiff)


def MA_n(df, n, ksgn='close'):
    """Join the *n*-row simple moving average of column *ksgn* to *df*.

    The new column is ``ma_<n>`` for ``close`` and ``<ksgn>_ma_<n>`` for any
    other column.  Returns a new DataFrame.
    """
    prefix = '' if ksgn == 'close' else ksgn + '_'
    name = '{}ma_{}'.format(prefix, n)
    average = df[ksgn].rolling(window=n, center=False).mean().rename(name)
    return df.join(average)


def macd_ext(df, n):
    """Add the summed ``macd`` of the last *n* finished positive/negative runs.

    Adds ``macd_1a`` (previous row's macd), ``macd_switch`` (1 when macd turns
    from negative to positive, -1 for the opposite turn, else 0) and the
    columns ``red0, green0, ..., red<n-1>, green<n-1>``.  ``red<k>`` /
    ``green<k>`` hold the sum of macd over the k-th most recent run that ended
    with a downward / upward turn.  Requires a ``macd`` column.
    """
    macd = df['macd']
    previous = macd.shift(1)
    df['macd_1a'] = previous
    turned_up = (macd > 0) & (previous < 0)
    turned_down = (macd < 0) & (previous > 0)
    df['macd_switch'] = turned_up.astype(int) - turned_down.astype(int)

    rows = df.shape[0]
    # One independent list per slot (a shared list would alias every slot).
    red = [[0.0] * rows for _ in range(n)]
    green = [[0.0] * rows for _ in range(n)]
    curr_red = [0.0] * n
    curr_green = [0.0] * n
    accu_value = 0.0
    pairs = zip(macd.tolist(), df['macd_switch'].tolist())
    for i, (value, switch) in enumerate(pairs):
        if pd.isna(value):
            continue  # rows without a macd value keep 0.0 in every slot
        if switch == 1:
            # A negative run just ended: push its sum to the front of green.
            curr_green = ([accu_value] + curr_green)[:n]
            accu_value = value
        elif switch == -1:
            # A positive run just ended: push its sum to the front of red.
            curr_red = ([accu_value] + curr_red)[:n]
            accu_value = value
        else:
            accu_value += value
        for j in range(n):
            red[j][i] = curr_red[j]
            green[j][i] = curr_green[j]

    for j in range(n):
        df = df.join(pd.Series(red[j], name='red{}'.format(j), index=df.index))
        df = df.join(
            pd.Series(green[j], name='green{}'.format(j), index=df.index))
    return df


def shrink_negative_line(df):
    """Flag a shrinking-volume down candle that follows a >9% up day.

    Uses the shifted columns produced by ``shift_till_n`` (``close_1a``,
    ``close_2a``, ``volume_1a``, ``low_1a``).  The previous version read
    ``close_1`` / ``close_2`` / ``volume_1`` / ``low_1``, which are never
    created, so it could not run.
    """
    df['shrink_negative_line'] = _flag(
        ((df['close_1a'] - df['close_2a']) / df['close_2a'] > 0.09)
        & (df['volume'] < df['volume_1a'])
        & (df['close'] < df['open'])
        & (df['low'] > df['low_1a'])
        & (df['close'] < df['close_1a']))
    return df


def shrink_volume(df):
    """Flag rows whose volume is below the previous row's volume."""
    df['shrink_volume'] = _flag(df['volume'] < df['volume_1a'])
    return df


def volume_boom(df):
    """Flag rows whose volume exceeds the 135-row volume moving average."""
    df['volume_boom'] = _flag(df['volume'] > df['volume_ma_135'])
    return df


def value_boom(df):
    """Flag rows whose close rose more than 9% over the previous close."""
    df['value_boom'] = _flag(
        (df['close'] - df['close_1a']) / df['close_1a'] > 0.09)
    return df


def bottom_shape(df):
    """Flag a three-candle bottom: the middle candle has the lowest low and
    the lowest high of the last three rows (no candle merging is applied)."""
    df['bottom_shape'] = _flag(
        (df['low_1a'] < df['low_2a'])
        & (df['low_1a'] < df['low'])
        & (df['high_1a'] < df['high_2a'])
        & (df['high_1a'] < df['high']))
    return df


def retrieve_special_volume(df):
    """Flag an up candle closing above the previous high, one row after a
    ``special_volume`` row."""
    df['retrieve_special_volume'] = _flag(
        (df['special_volume_1a'] == 1)
        & (df['close'] > df['high_1a'])
        & (df['close'] > df['open']))
    return df


def positive(df):
    """Flag up candles (close above open)."""
    df['positive'] = _flag(df['close'] > df['open'])
    return df


def negative(df):
    """Flag down candles (close below open)."""
    df['negative'] = _flag(df['close'] < df['open'])
    return df


def special_volume(df):
    """Flag a down candle that closes below the previous close on higher
    volume than the previous row."""
    df['special_volume'] = _flag(
        (df['open'] > df['close'])
        & (df['close'] < df['close_1a'])
        & (df['volume'] > df['volume_1a']))
    return df


def shift_till_n(df, indicator_list, n):
    """Add the 1..n row lags of every column in *indicator_list* (in place)."""
    for i in range(n):
        shift_i(df, indicator_list, i + 1)
    return df


def shift_i(df, indicator_list, i):
    """Add ``<name>_<i>a`` = column shifted by *i* rows, for each name."""
    for ind in indicator_list:
        df['{}_{}a'.format(ind, i)] = df[ind].shift(i)
    return df


def max_profit(x, percent_change=0.1):
    """Return 1 if the window's maximum beats its LAST element by at least
    *percent_change* (as a fraction), else 0.

    *x* may be a pandas Series or any indexable sequence such as a NumPy
    array; the last element is the reference price.
    """
    reference = x.iloc[-1] if hasattr(x, 'iloc') else x[-1]
    return 1 if (max(x) - reference) / reference >= percent_change else 0


def class_label(df, days, percent_change):
    """Add a binary label: can *percent_change* be gained within *days* rows?

    For each row the highest close among that row and the next *days* rows is
    compared with the row's own close.  The column ``label_<days>_<pct>%`` is
    1.0 when the relative gain is at least *percent_change*, 0.0 otherwise,
    and NaN for the last *days* rows, which lack a complete look-ahead window.
    This is the vectorised equivalent of applying ``max_profit`` to every
    reversed rolling window of ``days + 1`` closes.
    """
    close = df['close']
    # Rolling over the reversed series turns a trailing window into a
    # forward-looking one; index alignment restores the original order.
    future_high = close.iloc[::-1].rolling(days + 1).max().iloc[::-1]
    reached = (future_high - close) / close >= percent_change
    label = reached.astype(float).where(future_high.notna())
    df['label_{}_{}%'.format(days, percent_change * 100)] = label
    return df


def extend_factors(df):
    """Run the rule-based factor pipeline and return the new frame.

    Expects the columns open, high, low, close, volume and amount.  The call
    order is fixed because it determines the output column order.
    """
    for window in g_vol_ma_list:
        df = MA_n(df, window, 'volume')
    for window in g_ma_list:
        df = MA_n(df, window)

    indicator_list = ['open', 'high', 'low', 'close', 'volume', 'amount']
    indicator_list.extend('ma_{}'.format(w) for w in g_ma_list)
    indicator_list.extend('volume_ma_{}'.format(w) for w in g_vol_ma_list)
    df = shift_till_n(df, indicator_list, g_shift_n)

    df = special_volume(df)
    df = shift_till_n(df, ['special_volume'], g_shift_n)
    df = retrieve_special_volume(df)
    df = bottom_shape(df)

    df = MACD(df, 12, 26)
    df = macd_ext(df, 3)

    df = value_boom(df)
    df = shift_till_n(df, ['value_boom'], g_shift_n)
    df = volume_boom(df)
    df = shift_till_n(df, ['volume_boom'], g_shift_n)
    df = shrink_volume(df)
    df = shift_till_n(df, ['shrink_volume'], g_shift_n)
    # shrink_negative_line is deliberately not part of the output schema.

    df = positive(df)
    df = negative(df)
    df = shift_till_n(df, ['positive', 'negative'], g_shift_n)
    return df


def add_labels(df):
    """Add one ``class_label`` column per entry of ``g_label_specs``."""
    for days, percent_change in g_label_specs:
        df = class_label(df, days, percent_change)
    return df


def select_ml_stocks(stock_code_file):
    """Write *stock_code_file* with the codes that have enough history.

    A code is kept when its daily CSV exists and has more than
    ``g_ml_min_period`` rows.
    """
    all_stock_code_file = proj_path + 'data/tdx/all_stock_codes.csv'
    stock_codes = pd.read_csv(all_stock_code_file, encoding='unicode_escape',
                              dtype={'code': str})
    ml_stock_list = []
    for code in stock_codes['code']:
        input_file = proj_path + 'data/tdx/day/' + code + '.csv'
        if not os.path.exists(input_file):
            continue
        if pd.read_csv(input_file).shape[0] > g_ml_min_period:
            ml_stock_list.append(code)
    out_df = pd.DataFrame(ml_stock_list, columns=['code'])
    out_df.to_csv(stock_code_file, index=False)


def main():
    """Compute pandas_ta indicators, rule factors and labels for every
    selected stock.  All rows are recomputed on every run."""
    # Imported here because only the script entry point needs it; importing
    # pandas_ta is what makes the ``df.ta`` accessor available.
    import pandas_ta as ta  # noqa: F401

    time_start = time.time()
    stock_code_file = proj_path + 'data/tdx/ml_stock_code.csv'
    if not os.path.exists(stock_code_file):
        select_ml_stocks(stock_code_file)
    # Read codes as text so numeric-looking codes keep leading zeros and can
    # be concatenated into file names.
    stock_codes = pd.read_csv(stock_code_file, encoding='unicode_escape',
                              dtype={'code': str})

    out_dir = proj_path + 'data/extension/d/ml/'
    os.makedirs(out_dir, exist_ok=True)

    for code in stock_codes['code']:
        print('processing {}...'.format(code))
        input_file = proj_path + 'data/tdx/day/' + code + '.csv'
        if not os.path.exists(input_file):
            continue
        output_file = out_dir + code + '.csv'
        df = pd.read_csv(input_file).sort_index(ascending=True)

        # Appends the pandas_ta indicator columns to df in place.
        df.ta.strategy(exclude=['dpo', 'psar', 'supertrend', 'ichimoku', 'hilo'], verbose=True, timed=True)

        df = extend_factors(df)
        df = add_labels(df)

        df.to_csv(output_file, index=False)
        print(code + ' done!')

    time_end = time.time()
    print('程序所耗时间：', time_end - time_start)


if __name__ == '__main__':
    main()

欢迎大家关注、点赞、转发、留言，感谢支持！

微信群用于学习交流，感兴趣的读者请扫码加微信！

QQ群（676186743）用于资料共享，欢迎加入！

如果觉得《Python量化交易学习笔记（47）——因子扩展》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。