HtmlAgilityPack: obtaining the title and meta description
12,434
Solution 1
Go about it this way:
// Look up the <meta name="description"> element; the result is null-checked
// because the page may not contain one.
HtmlNode mdnode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
if (mdnode != null)
{
    // GetAttributeValue falls back to the supplied default when the "content"
    // attribute is absent, so a bare <meta name="description"> no longer causes
    // a NullReferenceException (Attributes["content"] would be null there).
    string fulldescription = mdnode.GetAttributeValue("content", string.Empty);
    Console.Write("DESCRIPTION: " + fulldescription);
}
Solution 2
Have you set a breakpoint and stepped through the code line by line to see where the error occurs?
If you have, then try something like this:
// Download the page source into a string, then hand that string to HtmlAgilityPack.
string result = string.Empty;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.google.com");
request.Method = "GET";
try
{
    // Dispose the response itself, not only its stream and reader.
    using (var response = request.GetResponse())
    using (var stream = response.GetResponseStream())
    using (var reader = new StreamReader(stream, Encoding.UTF8))
    {
        result = reader.ReadToEnd();
    }
}
catch (WebException ex)
{
    // A try block must have a catch or finally to compile. Report the failure
    // and continue with the empty result instead of swallowing it silently.
    System.Console.Error.WriteLine("Request failed: " + ex.Message);
}
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.LoadHtml(result);
Then carry over the rest of your code below the htmlDoc.LoadHtml call.
Solution 3
I think your problem is here:
htmlDoc.LoadHtml(link); // loading into HtmlAgilityPack
It should be:
htmlDoc.LoadHtml(html); // loading into HtmlAgilityPack
LoadHtml expects a string containing the HTML source, not the URL.
And probably you want to change:
var html = client.DownloadString(url); // downloads the starting url again, not the link that was found
to
var html = client.DownloadString(link); // downloads the page at the link that was found
Solution 4
/// <summary>
/// Handles the POST of the create form. When the model is valid, stores a new
/// <c>WebSite</c> whose description and tags are obtained through <c>HtmlAgi</c>
/// and redirects to the Index action; otherwise redisplays the form.
/// </summary>
/// <param name="website">The model bound from the posted form.</param>
/// <returns>A redirect to Index on success, or the form view with the posted model.</returns>
[HttpPost]
public ActionResult Create(WebSite website)
{
    // Validate first so HtmlAgi is never invoked for a URL that failed
    // validation (presumably it fetches the page; confirm against HtmlAgi).
    if (!ModelState.IsValid)
    {
        return View(website);
    }

    string desc = HtmlAgi(website.Url, "description");
    string keyword = HtmlAgi(website.Url, "Keywords");
    var userId = ((CustomPrincipal)User).UserId;

    r.Create(new WebSite
    {
        Description = desc,
        Tags = keyword,
        Url = website.Url,
        UserId = userId,
        Category = website.Category
    });
    return RedirectToAction("Index");
}
Comments
-
pi-2r almost 2 years
I am trying to practice with HtmlAgilityPack, but I am having some issues. Here is what I coded, but I cannot correctly get the title and the description of a web page. Could someone point out my mistake? :)
... public static void Main(string[] args) { string link = null; string str; string answer; int curloc; // holds current location in response string url = "http://stackoverflow.com/"; try { do { HttpWebRequest HttpWReq = (HttpWebRequest)WebRequest.Create(url); HttpWReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5"; HttpWebResponse HttpWResp = (HttpWebResponse)HttpWReq.GetResponse(); //url = null; // disallow further use of this URI Stream istrm = HttpWResp.GetResponseStream(); // Wrap the input stream in a StreamReader. StreamReader rdr = new StreamReader(istrm); // Read in the entire page. str = rdr.ReadToEnd(); curloc = 0; //WebPage result; do { // Find the next URI to link to. link = FindLink(str, ref curloc); //return the good link Console.WriteLine("Title found: " + curloc); //title = Title(str, ref curloc); if (link != null) { Console.WriteLine("Link found: " + link); using (System.Net.WebClient client = new System.Net.WebClient()) { HtmlDocument htmlDoc = new HtmlDocument(); var html = client.DownloadString(url); htmlDoc.LoadHtml(link); //chargement de HTMLAgilityPack var htmlElement = htmlDoc.DocumentNode.Element("html"); HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']"); if (node != null) { string desc = node.GetAttributeValue("content", ""); Console.Write("DESCRIPTION: " + desc); } else { Console.WriteLine("No description"); } var titleElement = htmlDoc.DocumentNode .Element("html") .Element("head") .Element("title"); if (titleElement != null) { string title = titleElement.InnerText; Console.WriteLine("Titre: {0}", title); } else { Console.WriteLine("no Title"); } Console.Write("Done"); } Console.Write("Link, More, Quit?"); answer = Console.ReadLine(); } else { Console.WriteLine("No link found."); break; } } while (link.Length > 0); // Close the Response. HttpWResp.Close(); } while (url != null); } catch{ ...}
Thanks in advance :)