C#爬虫通过URL爬出页面中A标签的链接地址与链接名称

using HtmlAgilityPack;

namespace ConsoleAppTest.SubClass
{
    /// <summary>
    /// Minimal crawler sample: downloads a page by URL and prints the link
    /// text and link address of every <c>&lt;a&gt;</c> tag that has an
    /// <c>href</c> attribute.
    /// </summary>
    public class HdhCmsSimpleCrawler
    {
        /// <summary>Page crawled when the caller does not supply a URL.</summary>
        private const string DefaultUrl = "http://www.hdhcms.com";

        /// <summary>
        /// Single client reused for every request; creating a new
        /// <see cref="HttpClient"/> per call can exhaust sockets under load.
        /// </summary>
        private static readonly HttpClient SharedClient = CreateClient();

        /// <summary>Program entry point: crawls the default URL.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        private static async Task Main(string[] args)
        {
            await CollectContent();
        }

        /// <summary>
        /// Downloads the HTML at <paramref name="url"/>, echoes it to the
        /// console, then lists every anchor that has non-empty text as
        /// "text -> href".
        /// </summary>
        /// <param name="url">
        /// Address of the page to crawl. Defaults to the site used by the
        /// original sample.
        /// </param>
        /// <returns>A task that completes when the page has been processed.</returns>
        /// <remarks>
        /// Best-effort by design: any failure (network, HTTP status, parsing)
        /// is reported on the console instead of being propagated.
        /// </remarks>
        public static async Task CollectContent(string url = DefaultUrl)
        {
            try
            {
                // Fetch the raw HTML of the page.
                string hdhcmsHtml = await SharedClient.GetStringAsync(url);
                Console.WriteLine("爬下来的内容：");
                Console.WriteLine(hdhcmsHtml);

                // Parse the markup with HtmlAgilityPack.
                var doc = new HtmlDocument();
                doc.LoadHtml(hdhcmsHtml);

                // Select every <a> element that carries an href attribute.
                // SelectNodes yields null (not an empty list) when nothing matches.
                var allLinks = doc.DocumentNode.SelectNodes("//a[@href]");
                if (allLinks is null)
                {
                    return;
                }

                Console.WriteLine($"发现{allLinks.Count}个链接：");
                foreach (var link in allLinks)
                {
                    // The XPath filter guarantees the href attribute exists.
                    string getHref = link.Attributes["href"].Value;
                    string getText = link.InnerText.Trim();

                    // Skip anchors without visible text (e.g. image-only links).
                    if (!string.IsNullOrEmpty(getText))
                    {
                        Console.WriteLine($"{getText} -> {getHref}");
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"抓取失败：{ex.Message}");
            }
        }

        /// <summary>
        /// Builds the shared client and sets a browser-like User-Agent header
        /// on every request it sends.
        /// </summary>
        private static HttpClient CreateClient()
        {
            var client = new HttpClient();
            client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0");
            return client;
        }
    }
}

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://www.haodehen.cn/did255633

更新时间：2025-08-01 阅读：106次