其实,一开始就应该先说实话:我没能实现直接解析在线PDF,而是用了迂回战术——先把文件下载到本地,再从本地解析。有朋友知道如何直接解析在线PDF的话,请告知哈!
我这次做的是抓取银行各类货币兑外币汇率的数据。起先,是使用HtmlAgilityPack抓取在线网站网页上的货币数据,但是由于业务增多,财务要求增加几类币种。其中除了泰铢,网页上并没有印尼卢比、越南盾这类比较少用的货币。然后在网站上找了一圈,发现有个在线PDF文档里有这些数据,于是就改为解析这个PDF文档了。
网站上的货币汇率数据是这样的: 本次使用了Spire.Pdf和Spire.License这两个DLL。它们是通过"工具 → 库程序包管理器 → 管理解决方案的NuGet程序包"联网搜索下载的,就是文件太大了。
WebConFig的数据为: <add key="TOACoinName" value="澳元,AUD;加元,CAD;欧罗,EUR;英镑,GBP;美元,USD;人民币(在岸),CNY;日圆,JPY;新加坡元,SGD;菲律宾比索,PHP;马来西亚币,MYR;泰国泰铢,THB;印尼卢比,IDR;越南盾,VND"/>

/// <summary>
/// Downloads the rate-sheet PDF to a local folder, extracts its text with Spire.Pdf and,
/// for every currency configured in the <c>TOACoinName</c> app setting, collects the
/// digits and dots found in the text that follows the currency code.
/// </summary>
/// <remarks>
/// Reads two app settings: <c>FILE_ROOT_PATH</c> (root folder for the download) and
/// <c>TOACoinName</c> (a ';'-separated list of "display name,currency code" pairs).
/// The parsed values are only held in local collections; nothing in this method
/// persists or returns them.
/// </remarks>
/// <exception cref="InvalidOperationException">A required app setting is missing or empty.</exception>
private void CatchUnDollarRate()
{
    // Number of characters after a currency code that are scanned for rate digits.
    const int RateWindowLength = 80;

    try
    {
        // Work around "The request was aborted: Could not create SSL/TLS secure channel."
        // NOTE(review): this callback accepts EVERY server certificate for the whole process
        // and is appended again on each call. Kept to preserve the original behaviour, but it
        // disables certificate validation - confirm it is really required for the target site.
        ServicePointManager.ServerCertificateValidationCallback += (s, cert, chain, sslPolicyErrors) => true;
        ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11 | SecurityProtocolType.Tls;

        // Local download folder.
        string rootPath = System.Configuration.ConfigurationManager.AppSettings["FILE_ROOT_PATH"];
        if (string.IsNullOrEmpty(rootPath))
        {
            throw new InvalidOperationException("App setting 'FILE_ROOT_PATH' is missing or empty.");
        }

        string path = Path.Combine(rootPath, "Ratesheet");
        // CreateDirectory is a no-op when the directory already exists.
        Directory.CreateDirectory(path);

        // "mm" is minutes; the previous "MM" silently inserted the month a second time.
        string timestamp = DateTime.Now.ToString("yyyy-MM-dd HH.mm.ss", System.Globalization.CultureInfo.InvariantCulture);
        string filepath = Path.Combine(path, timestamp + "Ratesheet.pdf");

        // Download to the local path (the URL here is a placeholder).
        DownDocumentPdf(filepath, "https://www.baidu.pdf");

        string data;
        Spire.Pdf.PdfDocument dd = new Spire.Pdf.PdfDocument();
        try
        {
            // The document is loaded on a dedicated STA thread, as in the original code.
            // NOTE(review): an exception thrown inside this thread is not caught by the
            // surrounding try/catch - confirm whether that needs handling.
            var thread = new Thread(() =>
            {
                // Read the PDF that was just saved to the local path.
                dd.LoadFromFile(filepath);
            });
            thread.SetApartmentState(ApartmentState.STA);
            thread.Start();
            thread.Join();

            // Accumulate the text of ALL pages; assigning inside the loop kept only the last page.
            StringBuilder text = new StringBuilder();
            foreach (PdfPageBase page in dd.Pages)
            {
                text.AppendLine(page.ExtractText());
            }
            data = text.ToString();
        }
        finally
        {
            dd.Close();
        }

        #region Parse rate rows
        Dictionary<string, string> rateDic = new Dictionary<string, string>();
        List<string> coinNameList = new List<string>();
        Dictionary<string, string> nameToCoinNo = new Dictionary<string, string>();

        string TOACoinName = System.Configuration.ConfigurationManager.AppSettings["TOACoinName"];
        if (string.IsNullOrEmpty(TOACoinName))
        {
            throw new InvalidOperationException("App setting 'TOACoinName' is missing or empty.");
        }

        foreach (string entry in TOACoinName.Split(';'))
        {
            string[] parts = entry.Split(',');
            if (parts.Length < 2)
            {
                // Malformed or empty entry (e.g. caused by a trailing ';') - skip it.
                continue;
            }

            string coinpername = parts[0].Trim();
            string coinperno = parts[1].Trim();
            if (coinpername.Length == 0 || coinperno.Length == 0)
            {
                continue;
            }

            coinNameList.Add(coinpername);
            nameToCoinNo[coinpername] = coinperno;

            // E.g. for Malaysian ringgit coinperno is "MYR"; IndexOf returns -1 when the
            // document does not mention that currency.
            int position = data.IndexOf(coinperno, StringComparison.Ordinal);
            if (position != -1)
            {
                // Clamp the window so a code near the end of the text does not throw.
                int length = Math.Min(RateWindowLength, data.Length - position);
                string coinposition = data.Substring(position, length);

                // Keep only digits and dots from the window.
                // NOTE(review): every number inside the window is concatenated into one
                // string - verify this matches the layout of the real rate sheet.
                rateDic[coinpername] = System.Text.RegularExpressions.Regex.Replace(coinposition, @"[^\d.\d]", "");
            }
        }
        #endregion
    }
    catch (Exception ex)
    {
        // The method returns void, so the failure is logged and rethrown with its
        // original stack trace instead of being returned as a string.
        System.Diagnostics.Trace.TraceError("CatchUnDollarRate failed: " + ex);
        throw;
    }
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and writes it to
/// <paramref name="filepath"/>, overwriting any existing file.
/// </summary>
/// <param name="filepath">Full path of the local file to create.</param>
/// <param name="url">Address of the document to download.</param>
private void DownDocumentPdf(string filepath, string url)
{
    // Build the request; no method is set, so the default HTTP method is used.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

    // GetResponse() is the call that actually sends the request.
    // The using blocks release the response and both streams even when the copy fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (Stream stream = new FileStream(filepath, FileMode.Create))
    {
        responseStream.CopyTo(stream);
    }
}