How can I Convert HTML to Text in C#?
Solution 1
What you are looking for is a text-mode DOM renderer that outputs text, much like Lynx or other Text browsers...This is much harder to do than you would expect.
Solution 2
Just a note about the HtmlAgilityPack for posterity. The project contains an example of parsing text to html, which, as noted by the OP, does not handle whitespace at all like anyone writing HTML would envisage. There are full-text rendering solutions out there, noted by others to this question, which this is not (it cannot even handle tables in its current form), but it is lightweight and fast, which is all I wanted for creating a simple text version of HTML emails.
using System.IO;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
//small but important modification to class https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs
public static class HtmlToText
{
public static string Convert(string path)
{
HtmlDocument doc = new HtmlDocument();
doc.Load(path);
return ConvertDoc(doc);
}
public static string ConvertHtml(string html)
{
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
return ConvertDoc(doc);
}
public static string ConvertDoc (HtmlDocument doc)
{
using (StringWriter sw = new StringWriter())
{
ConvertTo(doc.DocumentNode, sw);
sw.Flush();
return sw.ToString();
}
}
internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
foreach (HtmlNode subnode in node.ChildNodes)
{
ConvertTo(subnode, outText, textInfo);
}
}
public static void ConvertTo(HtmlNode node, TextWriter outText)
{
ConvertTo(node, outText, new PreceedingDomTextInfo(false));
}
internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
string html;
switch (node.NodeType)
{
case HtmlNodeType.Comment:
// don't output comments
break;
case HtmlNodeType.Document:
ConvertContentTo(node, outText, textInfo);
break;
case HtmlNodeType.Text:
// script and style must not be output
string parentName = node.ParentNode.Name;
if ((parentName == "script") || (parentName == "style"))
{
break;
}
// get text
html = ((HtmlTextNode)node).Text;
// is it in fact a special closing node output as text?
if (HtmlNode.IsOverlappedClosingElement(html))
{
break;
}
// check the text is meaningful and not a bunch of whitespaces
if (html.Length == 0)
{
break;
}
if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
{
html= html.TrimStart();
if (html.Length == 0) { break; }
textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
}
outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
{
outText.Write(' ');
}
break;
case HtmlNodeType.Element:
string endElementString = null;
bool isInline;
bool skip = false;
int listIndex = 0;
switch (node.Name)
{
case "nav":
skip = true;
isInline = false;
break;
case "body":
case "section":
case "article":
case "aside":
case "h1":
case "h2":
case "header":
case "footer":
case "address":
case "main":
case "div":
case "p": // stylistic - adjust as you tend to use
if (textInfo.IsFirstTextOfDocWritten)
{
outText.Write("\r\n");
}
endElementString = "\r\n";
isInline = false;
break;
case "br":
outText.Write("\r\n");
skip = true;
textInfo.WritePrecedingWhiteSpace = false;
isInline = true;
break;
case "a":
if (node.Attributes.Contains("href"))
{
string href = node.Attributes["href"].Value.Trim();
if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase)==-1)
{
endElementString = "<" + href + ">";
}
}
isInline = true;
break;
case "li":
if(textInfo.ListIndex>0)
{
outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
}
else
{
outText.Write("\r\n*\t"); //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
}
isInline = false;
break;
case "ol":
listIndex = 1;
goto case "ul";
case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
endElementString = "\r\n";
isInline = false;
break;
case "img": //inline-block in reality
if (node.Attributes.Contains("alt"))
{
outText.Write('[' + node.Attributes["alt"].Value);
endElementString = "]";
}
if (node.Attributes.Contains("src"))
{
outText.Write('<' + node.Attributes["src"].Value + '>');
}
isInline = true;
break;
default:
isInline = true;
break;
}
if (!skip && node.HasChildNodes)
{
ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten){ ListIndex = listIndex });
}
if (endElementString != null)
{
outText.Write(endElementString);
}
break;
}
}
}
internal class PreceedingDomTextInfo
{
public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
{
IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
}
public bool WritePrecedingWhiteSpace {get;set;}
public bool LastCharWasSpace { get; set; }
public readonly BoolWrapper IsFirstTextOfDocWritten;
public int ListIndex { get; set; }
}
internal class BoolWrapper
{
public BoolWrapper() { }
public bool Value { get; set; }
public static implicit operator bool(BoolWrapper boolWrapper)
{
return boolWrapper.Value;
}
public static implicit operator BoolWrapper(bool boolWrapper)
{
return new BoolWrapper{ Value = boolWrapper };
}
}
As an example, the following HTML code...
<!DOCTYPE HTML>
<html>
<head>
</head>
<body>
<header>
Whatever Inc.
</header>
<main>
<p>
Thanks for your enquiry. As this is the 1<sup>st</sup> time you have contacted us, we would like to clarify a few things:
</p>
<ol>
<li>
Please confirm this is your email by replying.
</li>
<li>
Then perform this step.
</li>
</ol>
<p>
Please solve this <img alt="complex equation" src="http://upload.wikimedia.org/wikipedia/commons/8/8d/First_Equation_Ever.png"/>. Then, in any order, could you please:
</p>
<ul>
<li>
a point.
</li>
<li>
another point, with a <a href="http://en.wikipedia.org/wiki/Hyperlink">hyperlink</a>.
</li>
</ul>
<p>
Sincerely,
</p>
<p>
The whatever.com team
</p>
</main>
<footer>
Ph: 000 000 000<br/>
mail: whatever st
</footer>
</body>
</html>
...will be transformed into:
Whatever Inc.
Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things:
1. Please confirm this is your email by replying.
2. Then perform this step.
Please solve this [complex equation<http://upload.wikimedia.org/wikipedia/commons/8/8d/First_Equation_Ever.png>]. Then, in any order, could you please:
* a point.
* another point, with a hyperlink<http://en.wikipedia.org/wiki/Hyperlink>.
Sincerely,
The whatever.com team
Ph: 000 000 000
mail: whatever st
...as opposed to:
Whatever Inc.
Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things:
Please confirm this is your email by replying.
Then perform this step.
Please solve this . Then, in any order, could you please:
a point.
another point, with a hyperlink.
Sincerely,
The whatever.com team
Ph: 000 000 000
mail: whatever st
Solution 3
You could use this:
public static string StripHTML(string HTMLText, bool decode = true)
{
Regex reg = new Regex("<[^>]+>", RegexOptions.IgnoreCase);
var stripped = reg.Replace(HTMLText, "");
return decode ? HttpUtility.HtmlDecode(stripped) : stripped;
}
Updated
Thanks for the comments I have updated to improve this function
Solution 4
I've heard from a reliable source that, if you're doing HTML parsing in .Net, you should look at the HTML agility pack again..
http://www.codeplex.com/htmlagilitypack
Some sample on SO..
HTML Agility pack - parsing tables
Solution 5
Assuming you have well formed html, you could also maybe try an XSL transform.
Here's an example:
using System;
using System.IO;
using System.Xml.Linq;
using System.Xml.XPath;
using System.Xml.Xsl;
class Html2TextExample
{
public static string Html2Text(XDocument source)
{
var writer = new StringWriter();
Html2Text(source, writer);
return writer.ToString();
}
public static void Html2Text(XDocument source, TextWriter output)
{
Transformer.Transform(source.CreateReader(), null, output);
}
public static XslCompiledTransform _transformer;
public static XslCompiledTransform Transformer
{
get
{
if (_transformer == null)
{
_transformer = new XslCompiledTransform();
var xsl = XDocument.Parse(@"<?xml version='1.0'?><xsl:stylesheet version=""1.0"" xmlns:xsl=""http://www.w3.org/1999/XSL/Transform"" exclude-result-prefixes=""xsl""><xsl:output method=""html"" indent=""yes"" version=""4.0"" omit-xml-declaration=""yes"" encoding=""UTF-8"" /><xsl:template match=""/""><xsl:value-of select=""."" /></xsl:template></xsl:stylesheet>");
_transformer.Load(xsl.CreateNavigator());
}
return _transformer;
}
}
static void Main(string[] args)
{
var html = XDocument.Parse("<html><body><div>Hello world!</div></body></html>");
var text = Html2Text(html);
Console.WriteLine(text);
}
}
Matt Crouch
I work on this now: https://github.com/compositactic/Compositactic
Updated on July 05, 2022Comments
-
Matt Crouch almost 2 years
I'm looking for C# code to convert an HTML document to plain text.
I'm not looking for simple tag stripping , but something that will output plain text with a reasonable preservation of the original layout.
The output should look like this:
I've looked at the HTML Agility Pack, but I don't think that's what I need. Does anyone have any other suggestions?
EDIT: I just download the HTML Agility Pack from CodePlex, and ran the Html2Txt project. What a disappointment (at least the module that does html to text conversion)! All it did was strip the tags, flatten the tables, etc. The output didn't look anything like the Html2Txt @ W3C produced. Too bad that source doesn't seem to be available. I was looking to see if there is a more "canned" solution available.
EDIT 2: Thank you everybody for your suggestions. FlySwat tipped me in the direction i wanted to go. I can use the
System.Diagnostics.Process
class to run lynx.exe with the "-dump" switch to send the text to standard output, and capture the stdout withProcessStartInfo.UseShellExecute = false
andProcessStartInfo.RedirectStandardOutput = true
. I'll wrap all this in a C# class. This code will be called only occassionly, so i'm not too concerned about spawning a new process vs. doing it in code. Plus, Lynx is FAST!! -
ShigaSuresh about 15 yearsgood thinking, it actually pretty easy to do a rough version.
-
EricSchaefer about 15 yearsWell it depends on the HTML. I wrote a quick and dirty version of this approach in php for a CMS that was sending weekly digest of the post by plain text email. In this case the editor for the posts only allowed certain HTML elements. It should be much harder if full HTML transitional is allowed.
-
Matt Crouch about 15 yearsThis is closer to what i'm looking for, but this still "flattened" the html tables. :(
-
madcolor about 15 yearsGood Luck.. I don't pretend that it's going to be easy.. but I think it's the correct path to go down.
-
Matt Crouch about 15 yearsNope, it actually makes it easier!! (see question edit). Thanks again!
-
Admin almost 14 yearsMax, be clear that this is your product that you are recommending. All of your answers IIRC are you suggesting this product. The SO community is pretty protective and sensitive to spamming/astroturfing. If you are not clear, and if all you do here is suggest people buy your software, you are going to end up doing yourself more harm than good.
-
Maxim Sautin almost 14 yearsHi Will! Yes, it's my product - you are right, sorry that this post looks like a advertising. I'll change it right now to make it wihtout any advertising.
-
Riko about 12 yearsThis is incomplete... for example it does not account for entities like etc...
-
Suren about 12 yearsIt was awesome, and will be better if combine with HtmlDecoded, I mean : 'HTMLText = HttpUtility.HtmlDecode(HTMLText);'
-
Steven Combs over 11 yearsThis is actually a great example! I used this in my web application. All of our content is stored as HTML in a database. A more direct example is using it like this. string test = HttpUtility.HtmlDecode(StripHTML(htmlText));
-
Igor Mironenko over 9 years@MattCrouch how does it make it easier? The Edit 2 answer in the original question is barely more than a hack - completely unacceptable to me and I suspect almost anyone's situation - would you acknowledge this?
-
Rado over 8 yearsThat doesn't convert HTML into text, it's for converting HTML Encoded strings into plain HTML (tags).
-
Roman Gudkov about 8 yearsIf not in web project you could also try System.Net.WebUtiltiy.HtmlDecode()
-
Ponni Radhakrishnan about 8 yearsPlease how can we adapt this solution to retain tab structure?
-
rboy over 7 yearsYou can also handle
node.name
tr
new line andtd
space to improve formatting within tables. -
chenk over 7 yearsif want to use WebUtility in Portable class library, you can use this nuget package. nuget.org/packages/PCLWebUtility
-
Fandango68 over 7 yearsThat worked so easily. Thank you! Way better than most other SO answers
-
Lucifer over 7 yearscan you post a code which returns formatted output string i.e. same amount of new line space etc.
-
Ian Warburton about 7 yearsLooks interesting although it doesn't handle HR.
-
Eric Lloyd over 2 yearsThere's a sketchy bit in the code as currently shown --
if(textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
(note single=
) appears to be side-effecting an assignment inside theif()
statement. It's a code smell that makes it difficult to infer intent vs. typo.