using HtmlAgilityPack;
namespace ConsoleAppTest.SubClass
{
/// <summary>
/// C# crawler that fetches a page by URL and extracts the address and display text of every A-tag link.
/// </summary>
public class HdhCmsSimpleCrawler
{
    /// <summary>Page that is crawled when the caller does not supply a URL.</summary>
    private const string DefaultUrl = "http://www.hdhcms.com";

    /// <summary>
    /// Single client shared by every call. Creating and disposing an HttpClient
    /// per request is discouraged because it can exhaust sockets under repeated use.
    /// </summary>
    private static readonly HttpClient SharedClient = CreateClient();

    /// <summary>Program entry point: crawls the default page.</summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static async Task Main(string[] args)
    {
        await CollectContent();
    }

    /// <summary>
    /// Crawls the default page, prints its raw HTML and then every link found in it.
    /// </summary>
    /// <returns>A task that completes when the page has been fetched and printed.</returns>
    public static Task CollectContent()
    {
        return CollectContent(DefaultUrl);
    }

    /// <summary>
    /// Downloads <paramref name="url"/>, prints the raw HTML, and then prints the
    /// text and target of every anchor that has an href attribute.
    /// Fetch failures are reported on the console instead of being thrown.
    /// </summary>
    /// <param name="url">Absolute address of the page to crawl.</param>
    /// <returns>A task that completes when the page has been fetched and printed.</returns>
    public static async Task CollectContent(string url)
    {
        try
        {
            // Fetch the HTML content.
            string html = await SharedClient.GetStringAsync(url);
            Console.WriteLine("爬下来的内容:");
            Console.WriteLine(html);

            PrintLinks(html);
        }
        // Only errors that stem from the request itself (network/HTTP failure,
        // timeout, unusable URL) are reported; anything else is a programming
        // error and is allowed to surface.
        catch (Exception ex) when (ex is HttpRequestException
                                      or TaskCanceledException
                                      or InvalidOperationException
                                      or UriFormatException)
        {
            Console.WriteLine($"抓取失败:{ex.Message}");
        }
    }

    /// <summary>Builds the shared client with a browser-like User-Agent header.</summary>
    private static HttpClient CreateClient()
    {
        var client = new HttpClient();
        client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0");
        return client;
    }

    /// <summary>
    /// Parses <paramref name="html"/> with HtmlAgilityPack and prints one
    /// "text -> href" line for every anchor that has an href and non-empty text.
    /// </summary>
    /// <param name="html">HTML source of the page.</param>
    private static void PrintLinks(string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // The original code guards against a null result here: SelectNodes yields
        // null rather than an empty collection when no node matches.
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors is null)
        {
            return;
        }

        Console.WriteLine($"发现{anchors.Count}个链接:");
        foreach (var anchor in anchors)
        {
            // Decode HTML entities so "&amp;" in a URL prints as "&" and a link
            // whose text is only "&nbsp;" is recognised as empty after Trim().
            string href = HtmlEntity.DeEntitize(anchor.Attributes["href"].Value);
            string text = HtmlEntity.DeEntitize(anchor.InnerText).Trim();
            if (!string.IsNullOrEmpty(text))
            {
                Console.WriteLine($"{text} -> {href}");
            }
        }
    }
}
}
// See more details about: C# crawler that extracts the link addresses and link names of A tags from a page by URL...