HtmlAgilityPack: obtaining the title and meta description

12,434

Solution 1

Go about it this way:

// Locate the <meta name="description" ...> element anywhere in the document.
HtmlNode mdnode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");

if (mdnode != null)
{
    // The attribute lookup yields null when the tag has no content attribute,
    // so check it before reading Value instead of risking a NullReferenceException.
    HtmlAttribute desc = mdnode.Attributes["content"];
    if (desc != null)
    {
        string fulldescription = desc.Value;
        Console.Write("DESCRIPTION: " + fulldescription);
    }
}

Solution 2

Have you set a breakpoint and stepped through the code line by line to see where the error occurs?

If you have, then try something like this:

// Download the page as text first, then hand the markup (not the URL) to HtmlAgilityPack.
string result = string.Empty;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.google.com");
request.Method = "GET";
try
{
    // Dispose the response together with its stream and reader so the
    // underlying connection is released even if reading fails.
    using (var response = request.GetResponse())
    using (var stream = response.GetResponseStream())
    using (var reader = new StreamReader(stream, Encoding.UTF8))
    {
        result = reader.ReadToEnd();
    }
}
catch (WebException ex)
{
    // A try block needs a handler to compile; report the failure and leave
    // result empty so the document below is simply loaded with no content.
    Console.WriteLine("Download failed: " + ex.Message);
}

HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.LoadHtml(result);

Then carry over the rest of your code below the htmlDoc.LoadHtml call.

Solution 3

I think your problem is here:

htmlDoc.LoadHtml(link); // loading into HtmlAgilityPack

It should be:

 htmlDoc.LoadHtml(html); // loading into HtmlAgilityPack

LoadHtml expects a string containing the HTML source, not the URL.

And probably you want to change:

var html = client.DownloadString(url);

to

var html = client.DownloadString(link);

Solution 4

 [HttpPost]
    // Handles the posted form for a new WebSite entry: stores the submitted URL
    // and category together with the description and keywords looked up through
    // HtmlAgi, then redirects to Index. An invalid model re-displays the form.
    public ActionResult Create(WebSite website)
    {
        // Validate first so a rejected submission returns immediately,
        // without running the HtmlAgi lookups for a URL that will not be saved.
        if (!ModelState.IsValid)
        {
            return View(website);
        }

        // HtmlAgi is defined elsewhere; presumably it reads the named meta tag
        // from the page at the given URL — confirm against its definition.
        string desc = HtmlAgi(website.Url, "description");
        string keyword = HtmlAgi(website.Url, "Keywords");

        var userId = ((CustomPrincipal)User).UserId;
        r.Create(new WebSite
        {
            Description = desc,
            Tags = keyword,
            Url = website.Url,
            UserId = userId,
            Category = website.Category
        });

        return RedirectToAction("Index");
    }
Share:
12,434
pi-2r
Author by

pi-2r

Sorcerer's apprentice in Java.

Updated on July 10, 2022

Comments

  • pi-2r
    pi-2r almost 2 years

    I am trying to practice with HtmlAgilityPack, but I am having some issues. Here is what I coded, but I cannot correctly get the title and the description of a web page. If someone could enlighten me about my mistake :)

    ...
    // Sample entry point from the question: downloads the page at `url`, walks
    // through the links that FindLink extracts from it, and for every link tries
    // to print the meta description and the title using HtmlAgilityPack.
    public static void Main(string[] args)
        {
            string link = null;
            string str;
            // NOTE(review): assigned from Console.ReadLine below but never read in the visible code.
            string answer;
    
            int curloc; // holds current location in response 
            string url = "http://stackoverflow.com/";
    
            try
            {
    
                do
                {
                    HttpWebRequest HttpWReq = (HttpWebRequest)WebRequest.Create(url);
                    HttpWReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";
                    HttpWebResponse HttpWResp = (HttpWebResponse)HttpWReq.GetResponse();
                    //url = null; // disallow further use of this URI 
                    Stream istrm = HttpWResp.GetResponseStream();
                    // Wrap the input stream in a StreamReader. 
                    StreamReader rdr = new StreamReader(istrm);
    
                    // Read in the entire page. 
                    str = rdr.ReadToEnd();
    
                    curloc = 0;
                    //WebPage result;
                    do
                    {
                        // Find the next URI to link to. 
                        link = FindLink(str, ref curloc); //return the good link
                        // NOTE(review): this prints curloc (the scan position) under the label "Title found".
                        Console.WriteLine("Title found: " + curloc);
                        //title = Title(str, ref curloc);
    
                        if (link != null)
                        {
                            Console.WriteLine("Link found: " + link);
                            using (System.Net.WebClient client = new System.Net.WebClient())
                            {
                                HtmlDocument htmlDoc = new HtmlDocument();
                                // NOTE(review): downloads `url` (the start page), not `link`,
                                // and `html` is not used anywhere below.
                                var html = client.DownloadString(url);
                                // NOTE(review): LoadHtml receives the `link` string here rather than the downloaded `html`.
                                htmlDoc.LoadHtml(link); //loading into HtmlAgilityPack
                                // NOTE(review): htmlElement is not used in the visible code.
                                var htmlElement = htmlDoc.DocumentNode.Element("html");
    
                                // Look for <meta name="description"> and print its content attribute ("" when absent).
                                HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                                if (node != null)
                                {
                                    string desc = node.GetAttributeValue("content", "");
                                    Console.Write("DESCRIPTION: " + desc);
                                }
                                else
                                {
                                    Console.WriteLine("No description");
                                }
    
                                // NOTE(review): the chain is not null-checked between steps; presumably it
                                // throws when the <html> or <head> element is missing — confirm.
                                var titleElement =
                                                    htmlDoc.DocumentNode
                                                       .Element("html")
                                                       .Element("head")
                                                       .Element("title");
                                if (titleElement != null)
                                {
                                    string title = titleElement.InnerText;
                                    Console.WriteLine("Titre: {0}", title);
                                }
                                else
                                {
                                    Console.WriteLine("no Title");
                                }
                                Console.Write("Done");
                            }
                            Console.Write("Link, More, Quit?");
                            answer = Console.ReadLine();
                        }
                        else
                        {
                            Console.WriteLine("No link found.");
                            // Leaves only the inner link loop; the outer loop continues.
                            break;
                        }
                    } while (link.Length > 0);
    
                    // Close the Response.
                    HttpWResp.Close();
                    // NOTE(review): url is never reassigned in the visible code (the assignment
                    // near the top of the loop is commented out), so this condition stays true.
                } while (url != null); 
            }
    catch{ ...}
    

    Thanks in advance :)