// Published: 2020/12/3 16:29:00 | Author: Administrator | Source: this site | Views: 910
using HtmlAgilityPack;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace BgCollection
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form and builds its controls by calling InitializeComponent.
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// Click handler for button1: downloads the rendered mobile HTML of the URL typed into
/// textBox1 and appends the raw page, the question title and the answer body to the
/// three rich text boxes.
/// </summary>
/// <param name="sender">Object that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Rendered page source; the second argument (1) requests a single scroll step.
    string pageHtml = FinalHtml.GetMobileHtml(textBox1.Text, 1);

    /*
     * Markup this handler looks for:
     *   title:       <div class="wgt-question-title"><h2>...</h2></div>
     *   description: <div class="wgt-question-desc-inner">...</div>   (not extracted here)
     *   content:     <div class="full-content">...</div>
     */

    // First <div class="full-content"> element, ending at its first closing </div>.
    Regex fullContentDiv = new Regex("<div[^>]*?class=\"full-content\"[^>]*?>(([^<]*(?(?!</div>)<))*)</div>", RegexOptions.Multiline | RegexOptions.Singleline);
    string fullContentHtml = fullContentDiv.Match(pageHtml).Value;

    // NOTE(review): "\n\r" is kept exactly as it was; it looks like a reversed CRLF — confirm.
    this.Invoke(new Action(() =>
    {
        richTextBox1.Text += pageHtml + "\n\r";
        richTextBox2.Text += Html2Text(GetValue(pageHtml, "<div class=\"wgt-question-title\">", "</div>"));
        richTextBox3.Text += GetValue(fullContentHtml, "<div class=\"full-content\">", "</div>") + "\n\r";
    }));
}
/// <summary>
/// Reduces an HTML fragment to text: removes style and script blocks together with their
/// content, removes every other tag except line-break tags (br), and trims the result.
/// Tag matching is case-insensitive.
/// </summary>
/// <param name="htmlStr">HTML fragment; may be null or empty.</param>
/// <returns>The stripped text with br tags preserved, or "" for null/empty input.</returns>
public static string Html2Text(string htmlStr)
{
    if (String.IsNullOrEmpty(htmlStr))
    {
        return "";
    }

    const string stylePattern = "<style[^>]*?>[\\s\\S]*?<\\/style>";    // CSS blocks
    const string scriptPattern = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // JavaScript blocks
    // Any tag whose name is not exactly "br". The word boundary keeps <br>, <br/>, <br />
    // and </br> regardless of what follows them, while tags such as <broken> are removed.
    const string tagPattern = @"<(?!\/?br\b)[^<>]*>";

    htmlStr = Regex.Replace(htmlStr, stylePattern, "", RegexOptions.IgnoreCase);
    htmlStr = Regex.Replace(htmlStr, scriptPattern, "", RegexOptions.IgnoreCase);
    htmlStr = Regex.Replace(htmlStr, tagPattern, "", RegexOptions.IgnoreCase);
    return htmlStr.Trim();
}
/// <summary>
/// Appends every match of <paramref name="r"/> found in <paramref name="OutString"/> to
/// <paramref name="sb"/>, one per line, then searches again inside each match so that
/// nested matches are collected too.
/// </summary>
/// <param name="OutString">Text to search.</param>
/// <param name="r">Regular expression to apply.</param>
/// <param name="sb">Receives each matched string followed by a line terminator.</param>
private void MatchString(string OutString, Regex r, StringBuilder sb)
{
    foreach (Match hit in r.Matches(OutString))
    {
        string matched = hit.Value;
        sb.AppendLine(matched);

        // Drop the first character before recursing so the same match cannot be found
        // again at the same position, which would recurse without end.
        MatchString(matched.Substring(1), r, sb);
    }
}
/// <summary>
/// Handler with the standard event signature that currently does nothing.
/// NOTE(review): presumably wired to the form's Load event in the designer file, which is
/// not visible here — confirm before removing.
/// </summary>
/// <param name="sender">Object that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Form1_Load(object sender, EventArgs e)
{
}
/// <summary>
/// Returns the text located between a start delimiter and the next end delimiter.
/// Both delimiters are matched literally (they are escaped before being placed into the
/// regular expression) and are not included in the result.
/// </summary>
/// <param name="str">Text to search; null or empty yields "".</param>
/// <param name="s">Literal start delimiter.</param>
/// <param name="e">Literal end delimiter.</param>
/// <returns>
/// The shortest text between the first occurrence of <paramref name="s"/> and the
/// following <paramref name="e"/>, or "" when no such pair exists or an argument is null.
/// </returns>
static string GetValue(string str, string s, string e)
{
    if (string.IsNullOrEmpty(str) || s == null || e == null)
    {
        return "";
    }

    // Regex.Escape keeps characters such as '(', '[', '.' or '?' inside the delimiters
    // from being interpreted as regular-expression syntax.
    Regex rg = new Regex("(?<=" + Regex.Escape(s) + ")[\\s\\S]*?(?=" + Regex.Escape(e) + ")", RegexOptions.Singleline);
    return rg.Match(str).Value;
}
/// <summary>
/// Returns the first span of text that starts with one delimiter and ends with the next
/// occurrence of another, delimiters included. At least one character must lie between
/// them. Both delimiters are matched literally.
/// </summary>
/// <param name="str">Text to search; null or empty yields "".</param>
/// <param name="s">Literal start delimiter.</param>
/// <param name="e">Literal end delimiter.</param>
/// <returns>The matched span including both delimiters, or "" when nothing matches.</returns>
static string GetHtmlValue(string str, string s, string e)
{
    if (string.IsNullOrEmpty(str) || s == null || e == null)
    {
        return "";
    }

    // Escape the delimiters so regular-expression metacharacters in them are literal.
    Regex rg = new Regex(Regex.Escape(s) + "(.+?)" + Regex.Escape(e), RegexOptions.Singleline);
    return rg.Match(str).Value;
}
/// <summary>
/// Returns the inner text of the first div whose class attribute equals
/// <paramref name="fildname"/>. Only divs whose attributes are all double-quoted and
/// whose content contains no line break and no closing div before its own are matched.
/// </summary>
/// <param name="html">HTML to search.</param>
/// <param name="fildname">Exact value of the class attribute to look for.</param>
/// <returns>The content of the first matching div, or "" when none matches.</returns>
static string GetText(string html, string fildname)
{
    const string divPattern = "<div(\\s+(title=\"(?<title>[^\"]*?)\"|class=\"(?<class>[^\"]*?)\"|[-\\w]+=\"[^\"]*?\"))*\\s*>(?<text>(.*?))</div>";

    Match hit = Regex.Matches(html, divPattern)
        .Cast<Match>()
        .FirstOrDefault(candidate => candidate.Groups["class"].Value.Equals(fildname));

    return hit == null ? string.Empty : hit.Groups["text"].Value;
}
/// <summary>
/// Returns the inner HTML of the first div whose class attribute equals
/// <paramref name="fildname"/>, correctly skipping over nested div elements.
/// </summary>
/// <remarks>
/// The previous pattern defined neither a "class" nor a "text" group, so the method always
/// returned an empty string. The class name is now part of the pattern (escaped), and the
/// content is captured with a balancing group that pairs every nested opening div with its
/// closing tag.
/// </remarks>
/// <param name="html">HTML to search; null or empty yields "".</param>
/// <param name="fildname">Exact value of the class attribute (single- or double-quoted).</param>
/// <returns>
/// The content between the matching div's opening tag and its own closing tag, or ""
/// when no such div exists or its nested divs are unbalanced.
/// </returns>
static string GetDivText(string html, string fildname)
{
    if (string.IsNullOrEmpty(html) || fildname == null)
    {
        return string.Empty;
    }

    string pattern =
        // Opening tag carrying class="fildname" or class='fildname'.
        "<div\\b[^>]*?\\sclass=(?<q>[\"'])" + Regex.Escape(fildname) + "\\k<q>[^>]*>" +
        // Content: push on each nested <div>, pop on each </div>, otherwise consume any
        // character that does not start a div tag.
        "(?<text>(?>(?<Open><div\\b[^>]*>)|(?<-Open></div>)|(?!</?div\\b).)*)" +
        // Fail if a nested div is still open, then require this div's own closing tag.
        "(?(Open)(?!))</div>";

    Match hit = Regex.Match(html, pattern, RegexOptions.Singleline);
    return hit.Success ? hit.Groups["text"].Value : string.Empty;
}
/// <summary>
/// Parses <paramref name="html"/> with HtmlAgilityPack, walks the nodes selected by the
/// XPath "//class" and, for each one carrying a non-blank <paramref name="fildname"/>
/// attribute, builds a timestamped ".jpg" file name.
/// </summary>
/// <remarks>
/// NOTE(review): the generated file names are never used and the method always returns
/// "", so this looks unfinished. The XPath "//class" selects elements named "class",
/// which is probably not what was meant — confirm the intended selector.
/// </remarks>
/// <param name="html">HTML to parse.</param>
/// <param name="fildname">Name of the attribute to read from each selected node.</param>
/// <returns>Always an empty string.</returns>
static string GetXml(string html, string fildname) {
    int imgNum = 0; // running image number
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);
    string imgPath = "//class";
    string fileName = ".jpg";

    // SelectNodes yields null rather than an empty collection when nothing matches;
    // enumerating it unguarded threw NullReferenceException.
    var nodes = doc.DocumentNode.SelectNodes(imgPath);
    if (nodes == null)
    {
        return "";
    }

    foreach (HtmlNode node in nodes)
    {
        if (node.Attributes[fildname] != null)
        {
            string imgUrl = node.Attributes[fildname].Value.ToString();
            if (imgUrl != "" && imgUrl != " ")
            {
                imgNum++;
                // 24-hour clock, minutes before seconds (was "yyyyMMddhhssmm").
                var file = DateTime.Now.ToString("yyyyMMddHHmmss") + imgNum + fileName;
            }
        }
    }
    return "";
}
/// <summary>
/// Collects the distinct, non-empty values of one attribute over every occurrence of a
/// tag in the text. Matching ignores case.
/// </summary>
/// <param name="str">Text to search.</param>
/// <param name="tagName">Tag name; placed into the regular expression as-is.</param>
/// <param name="attrib">Attribute name; placed into the regular expression as-is.</param>
/// <returns>The attribute values in order of first appearance, without duplicates.</returns>
public static List<string> GetTagAttr(string str, string tagName, string attrib)
{
    string pattern = string.Format("<{0}[^>]*?{1}=(['\"\"]?)(?<url>[^'\"\"\\s>]+)\\1[^>]*>", tagName, attrib);

    return Regex.Matches(str, pattern, RegexOptions.IgnoreCase)
        .Cast<Match>()
        .Select(hit => hit.Groups["url"].Value)
        .Where(value => !string.IsNullOrEmpty(value))
        .Distinct()
        .ToList();
}
/// <summary>
/// Collects the distinct, non-empty contents of every div whose class attribute is
/// exactly <paramref name="classname"/> (double-quoted). Each div's content ends at the
/// first closing div tag that follows its opening tag. Matching ignores case.
/// </summary>
/// <remarks>
/// The previous pattern had no group named "text", so the list was always empty. The
/// content is now captured in that group and the class name is escaped before use.
/// </remarks>
/// <param name="str">HTML to search; null or empty yields an empty list.</param>
/// <param name="classname">Value of the class attribute to look for, matched literally.</param>
/// <param name="attrib">Not used; kept so existing callers keep compiling.</param>
/// <returns>The div contents in order of first appearance, without duplicates.</returns>
public static List<string> GetHtmlTagAttr(string str, string classname, string attrib)
{
    List<string> list = new List<string>();
    if (string.IsNullOrEmpty(str) || classname == null)
    {
        return list;
    }

    // Lazy match up to the first </div>; [\s\S] lets the content span line breaks.
    string pattern = "<div[^>]*?class=\"" + Regex.Escape(classname) + "\"[^>]*?>(?<text>[\\s\\S]*?)</div>";
    foreach (Match m in Regex.Matches(str, pattern, RegexOptions.IgnoreCase))
    {
        string result = m.Groups["text"].Value;
        if (string.IsNullOrEmpty(result) || list.Contains(result))
        {
            continue;
        }
        list.Add(result);
    }
    return list;
}
/// <summary>
/// Collects the distinct, non-empty text contents of every occurrence of a tag whose
/// content contains no further markup (no '&lt;' character). Matching ignores case.
/// </summary>
/// <param name="str">Text to search.</param>
/// <param name="tagName">Tag name; placed into the regular expression as-is.</param>
/// <returns>The text contents in order of first appearance, without duplicates.</returns>
public static List<string> GetTagContent(string str, string tagName)
{
    string pattern = string.Format("<{0}[^>]*?>(?<Text>[^<]*)</{1}>", tagName, tagName);

    HashSet<string> seen = new HashSet<string>();
    List<string> contents = new List<string>();
    foreach (Match hit in Regex.Matches(str, pattern, RegexOptions.IgnoreCase))
    {
        string text = hit.Groups["Text"].Value;
        // Skip empty contents; Add returns false for a value that was already collected.
        if (text.Length > 0 && seen.Add(text))
        {
            contents.Add(text);
        }
    }
    return contents;
}
}
/// <summary>
/// Fetches the HTML of a page after its JavaScript has run, by loading it in headless
/// Chrome through Selenium and scrolling it step by step before reading the page source.
/// </summary>
public class FinalHtml
{
    /// <summary>
    /// Loads the page in headless Chrome with mobile emulation (320x800, iPhone user
    /// agent, images disabled), scrolls it and returns the resulting page source.
    /// </summary>
    /// <param name="url">Address to load.</param>
    /// <param name="sectionNum">Number of scroll steps used to reach the bottom.</param>
    /// <returns>The page source after scrolling.</returns>
    public static string GetMobileHtml(string url, int sectionNum)
    {
        // Hide the chromedriver console window.
        var cdSvc = ChromeDriverService.CreateDefaultService();
        cdSvc.HideCommandPromptWindow = true;

        // Emulated mobile device.
        ChromeMobileEmulationDeviceSettings CMEDS = new ChromeMobileEmulationDeviceSettings();
        CMEDS.Width = 320;
        CMEDS.Height = 800;
        CMEDS.PixelRatio = 1.0;
        CMEDS.UserAgent = "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25";

        ChromeOptions options = new ChromeOptions();
        options.AddArguments("lang=zh_CN.UTF-8");
        // Headless mode: no browser window.
        options.AddArgument("--headless");
        options.EnableMobileEmulation(CMEDS);
        // Do not load images.
        options.AddUserProfilePreference("profile.default_content_setting_values.images", 2);
        // GPU acceleration can cause black screens and high CPU usage in Chrome, so disable it.
        options.AddArgument("--disable-gpu");

        return ScrollAndReadSource(new ChromeDriver(cdSvc, options), url, sectionNum);
    }

    /// <summary>
    /// Loads the page in headless Chrome, scrolls it and returns the resulting page source.
    /// </summary>
    /// <param name="url">Address to load.</param>
    /// <param name="sectionNum">Number of scroll steps used to reach the bottom.</param>
    /// <returns>The page source after scrolling.</returns>
    public static string GetFinalHtml(string url, int sectionNum)
    {
        // Headless mode: no browser window.
        ChromeOptions options = new ChromeOptions();
        options.AddArgument("headless");

        // Hide the chromedriver console window.
        ChromeDriverService driverService = ChromeDriverService.CreateDefaultService();
        driverService.HideCommandPromptWindow = true;

        return ScrollAndReadSource(new ChromeDriver(driverService, options), url, sectionNum);
    }

    /// <summary>
    /// Runs <see cref="GetFinalHtml"/> on a thread-pool thread and returns its result.
    /// </summary>
    /// <param name="url">Address to load.</param>
    /// <param name="sectionNum">Number of scroll steps used to reach the bottom.</param>
    /// <returns>A task producing the page source after scrolling.</returns>
    public static async Task<string> GetFinalHtmlAsync(string url, int sectionNum)
    {
        return await Task.Run(() => GetFinalHtml(url, sectionNum));
    }

    /// <summary>
    /// Navigates the driver to the URL, scrolls down in equal steps with a one-second pause
    /// after each, and returns the page source. The driver is always quit afterwards, even
    /// when navigation or script execution throws, so no browser process is left behind.
    /// </summary>
    private static string ScrollAndReadSource(IWebDriver driver, string url, int sectionNum)
    {
        try
        {
            driver.Navigate().GoToUrl(url);
            string title = driver.Title;
            Console.WriteLine($"Title: {title}");

            // Scroll to the bottom; step i targets i/sectionNum of the document height.
            Console.Write("页面滚动中,请稍后");
            IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
            for (int i = 1; i <= sectionNum; i++)
            {
                string jsCode = "window.scrollTo({top: document.body.scrollHeight / " + sectionNum + " * " + i + ", behavior: \"smooth\"});";
                js.ExecuteScript(jsCode);
                Console.Write(".");
                // Pause one second before the next step.
                Thread.Sleep(1000);
            }
            Console.WriteLine();

            return driver.PageSource;
        }
        finally
        {
            driver.Quit();
        }
    }
}
}