失眠网,内容丰富有趣,生活中的好帮手!
失眠网 > 爬虫模拟POST请求https (爬中国银行汇率----中文)

爬虫模拟POST请求https (爬中国银行汇率----中文)

时间:2019-01-02 17:16:43

相关推荐

爬虫模拟POST请求https  (爬中国银行汇率----中文)

.06.26

https://srh.bankofchina.com/search/whpj/search_cn.jsp

因为中行汇率页面改版(改为了 https 请求,页面也有很大变化),需要重新爬取中行汇率。原代码请求到的数据总与页面返回的不同,最后感觉应该是模拟 POST 失败了:打断点发现实际发出的是 GET 请求,可能是受 HTTPS 影响吧。

试了网上各种方法,测试的结果都不是 POST 请求,还是 GET,比如:

HttpURLConnection , HttpsURLConnection

// 发送POST请求必须设置如下两行
conn.setDoOutput(true);
conn.setDoInput(true);
httpURLConnection.setRequestMethod("POST");

或ssl

javax.net.ssl.X509TrustManager

SSLContext sslContext = SSLContext.getInstance("SSL");TrustManager[] tm = { new MyX509TrustManager() };

// 设置当前实例使用的SSLSocketFactory
conn.setSSLSocketFactory(ssf);

以上设置都没效果!!!!!!!

如果是http大家可以试网上的通用方法。

----------------------------------------------------------------

import mons.lang.StringUtils;import org.apache.http.HttpEntity;import org.apache.http.HttpStatus;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.utils.HttpClientUtils;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.joda.time.DateTime;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import org.springframework.stereotype.Service;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;/*** 爬取中国银行汇率-----------中** @author lm*/@Servicepublic class CrawlingExchangeRateCNService {public static void main(String[] args) {CrawlingExchangeRateCNService crawlingExchangeRateService = new CrawlingExchangeRateCNService();crawlingExchangeRateService.execute();}public void execute() {// List queryList = getExchangeRate("USD", "");List queryList = getExchangeRate("美元", "");System.out.println("长度:" + queryList.size());System.out.println("汇总:" + queryList);}/*** 获取当日传入币别汇率信息** @param sourceCurrency 币别* @param date 日期* @return*/private List getExchangeRate(String sourceCurrency, String date) {/***判断入参lsDate是否为空,若为空则赋值为当前时间**/String lsToday = StringUtils.isEmpty(date) ? 
new DateTime().toString("yyyy-MM-dd") : date;List list = new ArrayList();for (int page = 1; page <= 10; page++) {/**抓取时间为lsToday,币别为sourceCurrency,页数为page的中国银行网页信息*/String searchEnHtml = getSearchEnHtml(lsToday, sourceCurrency, String.valueOf(page));/**开始解析html中的汇率列表信息**/Map map = assembleObjByHtml(searchEnHtml, sourceCurrency, lsToday);String flag = (String) map.get("flag");String htmlPage = (String) map.get("page");list.add (map.get("list"));/**当flag为1执行成功时,或总页数等于循环查询到的页数时,则不需要再次进行查询**/if ("1".equals(flag) || Integer.parseInt(htmlPage) < page) {break;}}return list;}/*** 获取整个网页的内容** @param lsToday传入当前时间或空* @param lsSourceCurrency 币种* @param liPage 当前查询页数* @return*/private String getSearchEnHtml(String lsToday, String lsSourceCurrency, String liPage) {// StringBuilder url = new StringBuilder("/search/whpj/searchen.jsp?");StringBuilder url = new StringBuilder("/search/whpj/search_cn.jsp?");url.append("erectDate=").append(lsToday);url.append("&nothing=").append(lsToday);url.append("&pjname=").append(lsSourceCurrency);url.append("&page=").append(liPage);System.out.println("拼接好的url:" + url);CloseableHttpClient httpClient = HttpClients.createDefault();CloseableHttpResponse response = null;HttpPost httpPost = new HttpPost(url.toString());httpPost.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");httpPost.setHeader("Accept", "Accept: text/plain, */*");httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36");httpPost.addHeader("x-amazon-user-agent", "AmazonJavascriptScratchpad/1.0 (Language=Javascript)");httpPost.addHeader("X-Requested-With", "XMLHttpRequest");String html = "";try {response = httpClient.execute(httpPost);/**判断响应状态为200,进行处理**/if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {HttpEntity httpEntity = response.getEntity();html = EntityUtils.toString(httpEntity, "utf-8");} else 
{System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));}} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} finally {HttpClientUtils.closeQuietly(response);HttpClientUtils.closeQuietly(httpClient);}/***返回请求得到的页面**/return html;}/*** 根据取得的网页,解析html中的内容 先不做业务逻辑,全部查询** @param html 要解析的html* @param lsSourceCurrency 币种* @param lsToday日期* @return*/private Map assembleObjByHtml(String html, String lsSourceCurrency, String lsToday) {/**存储数据**/Map map = new HashMap(5);/**使用Jsoup将html解析为Document对象**/Document document = Jsoup.parse(html);/**获取页面隐藏域中存放的当前页数**/Elements pageItem = document.getElementsByAttributeValue("name", "page");String pageItemValue = "";pageItemValue = pageItem.select("input[name=page]").val();map.put("page", pageItemValue);/**获取页面的整个table信息,这个返回的页面基本上是返回多个table,下方需要细化处理**/Elements tables = document.getElementsByTag("table");/**设置存放汇率信息的table下标为-1(默认不存在)**/int tableIndex = -1;/**从table中循环获取,查找含有Currency Name字段的table**/for (int i = 0; i < tables.size(); i++) {Element element = tables.get(i);String text = element.text();/**找到含有汇率信息的table,给tableIndex赋值,跳出循环**/if (text.indexOf("货币名称") > -1) {tableIndex = i;break;}}List<TerstEntity> list = new ArrayList();/**如果找到汇率列表信息**/if (tableIndex > -1) {Element table = tables.get(tableIndex);/**遍历该表格内的所有的<tr> <tr/>*/Elements trs = table.select("tr");for (int i = 1; i < trs.size(); ++i) {TerstEntity terstEntity = new TerstEntity();Element tr = trs.get(i);/**将数据放入实体对象中*/Elements tds = tr.select("td");//过滤 <td colspan="11" style="height:30px;">&nbsp;</td>if(tds !=null && tds.size() == 7){System.out.println(tds.get(0).text() + " 
"+i);terstEntity.setCurrencyName(tds.get(0).text());terstEntity.setBuyingRate(tds.get(1).text());terstEntity.setCashBuyingRate(tds.get(2).text());terstEntity.setSellingRate(tds.get(3).text());terstEntity.setCashSellingRate(tds.get(4).text());terstEntity.setMiddleRate(tds.get(5).text());terstEntity.setPubTime(tds.get(6).text());list.add(terstEntity);}}map.put("list", list);}else{map.put("flag", "1");}return map;}}

import lombok.Data;

/**
 * Holder for one data row of the exchange-rate table (test use).
 * All values are kept as the raw cell text; accessors are generated by Lombok.
 */
@Data
public class TerstEntity {
    /** Text of table cell 0. */
    private String currencyName;
    /** Text of table cell 1. */
    private String buyingRate;
    /** Text of table cell 2. */
    private String cashBuyingRate;
    /** Text of table cell 3. */
    private String sellingRate;
    /** Text of table cell 4. */
    private String cashSellingRate;
    /** Text of table cell 5. */
    private String middleRate;
    /**
     * Text of table cell 6. Renamed from {@code PubTime} to follow lowerCamelCase;
     * the generated {@code getPubTime}/{@code setPubTime} accessors are unchanged.
     */
    private String pubTime;
}

<!-- jsoup: the HTML parser used by CrawlingExchangeRateCNService -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>

如果觉得《爬虫模拟POST请求https (爬中国银行汇率----中文)》对你有帮助,请点赞、收藏,并留下你的观点哦!

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。