关于使用HtmlAgilityPack

dean_carl · 发表于 2016-1-26 15:17:00

/// <summary>
      /// Downloads the page at the given address and returns the root node of the parsed HTML document.
      /// </summary>
      /// <param name="url">Absolute address of the page to download.</param>
      /// <returns>
      /// The root node of the parsed document, or <c>null</c> if the address could not be
      /// parsed, the request failed, or the response could not be read.
      /// </returns>
      /// <remarks>
      /// The response body is always decoded as gb2312; the charset declared by the server is ignored.
      /// </remarks>
      public static HtmlAgilityPack.HtmlNode GetHtmlNodeFromLink(string url)
      {
         try
         {
            Uri uri = new Uri(url);
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

            // Dispose the response, its stream and the reader deterministically so the
            // connection is released as soon as the body has been read instead of
            // lingering until finalization.
            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
            {
               HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
               html.LoadHtml(reader.ReadToEnd());
               return html.DocumentNode;
            }
         }
         catch (Exception)
         {
            // Deliberate best-effort contract: every failure (bad URL, non-HTTP scheme,
            // network error, unsupported encoding) is reported to the caller as null.
            return null;
         }
      }

      /// <summary>
      /// Collects the image address, link target and caption of every node selected by
      /// <paramref name="xPath"/> beneath <paramref name="htmlNode"/>.
      /// </summary>
      /// <param name="htmlNode">Node to search from; may be <c>null</c>, in which case the method returns <c>false</c>.</param>
      /// <param name="xPath">XPath expression selecting the nodes to read.</param>
      /// <param name="imgs">
      /// Receives one image address per node: the <c>image</c> group of the
      /// <c>background-image:url(...)</c> match when the node has a <c>style</c> attribute,
      /// otherwise the value of its <c>src2</c> attribute.
      /// </param>
      /// <param name="links">Receives the <c>href</c> attribute value of each node.</param>
      /// <param name="title">Receives the <c>title</c> attribute value of each node, or its <c>alt</c> value when the title is missing or empty.</param>
      /// <returns>
      /// <c>true</c> when at least one node matched and the three arrays were filled;
      /// <c>false</c> otherwise, in which case the arrays are left untouched.
      /// </returns>
      /// <remarks>Attributes that are absent on a node yield an empty string in the corresponding array slot.</remarks>
      public static bool GetGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode,string xPath,ref string[] imgs, ref string[] links,ref string[] title)
      {
         if (htmlNode == null || string.IsNullOrEmpty(xPath))
         {
            return false;
         }

         try
         {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            HtmlNodeCollection hnc = htmlNode.SelectNodes(xPath);//"//div[@class='slideBannerA homeSlideAD1']"
            if (hnc == null || hnc.Count < 1)
            {
               return false;
            }

            // Fill local arrays first so the caller's arrays are replaced only on success.
            string[] foundLinks = new string[hnc.Count];
            string[] foundTitles = new string[hnc.Count];
            string[] foundImgs = new string[hnc.Count];

            // Lazy quantifier: stop at the first ')' so later declarations in the same
            // style attribute (e.g. "filter:alpha(opacity=50)") are not swallowed.
            string cateDataRegex = @"background-image:url\((?<image>.+?)\)";
            Regex re = new Regex(cateDataRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            int i = 0;
            foreach (HtmlNode node in hnc)
            {
               foundLinks[i] = node.GetAttributeValue("href", string.Empty);

               // A style attribute, when present, takes precedence over src2.
               HtmlAttribute style = node.Attributes["style"];
               foundImgs[i] = style == null
                  ? node.GetAttributeValue("src2", string.Empty)
                  : re.Match(style.Value).Groups["image"].Value;

               // Fall back to alt when title is missing as well as when it is empty.
               string caption = node.GetAttributeValue("title", string.Empty);
               if (string.IsNullOrEmpty(caption))
               {
                  caption = node.GetAttributeValue("alt", string.Empty);
               }
               foundTitles[i] = caption;

               i++;
            }

            links = foundLinks;
            title = foundTitles;
            imgs = foundImgs;
            return true;
         }
         catch (Exception)
         {
            // Best-effort contract: failures such as an invalid XPath expression are
            // reported to the caller as false rather than thrown.
            return false;
         }
      }

      // Usage example

         // The arrays are passed by ref, so they must be definitely assigned before the call.
         string[] strLink = null;
         string[] strLinAlt = null;
         string[] strImg = null;
         string urls = "http://www.newegg.com.cn";
         HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink(urls);
         // GetGalleryInfo declares its array parameters as ref, so the call site must use ref (not out).
         // Only read the arrays when the call returns true.
         bool found = GetGalleryInfo(nodes, "//div[@class='slideBannerA homeSlideAD1']/div[1]/div[1]/a", ref strImg, ref strLink, ref strLinAlt);

		自动登录	找回密码
密码			马上注册

[Asp.Net] 关于使用HtmlAgilityPack