本文实例讲述了 C# 实现将 HTML 转换成纯文本的方法。分享给大家供大家参考。具体如下:
使用方法:
htmltotext convert = new htmltotext();
textbox2.Text = convert.convert(textbox1.Text);
C# 代码如下:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a small hand-written scanner: it walks the input one
/// character at a time, drops tags, turns block-level tags into line breaks,
/// skips the content of tags listed in <c>_ignoretags</c>, collapses runs of
/// whitespace (except inside &lt;pre&gt;) and finally decodes HTML entities.
/// Uses types from System, System.Collections.Generic, System.Text and
/// System.Web (HttpUtility).
/// An instance keeps its parsing state in fields, so a single instance must
/// not be used for two conversions at the same time.
/// </remarks>
class htmltotext
{
    // Static data tables.
    // Maps a lower-case tag name (closing tags carry a leading '/') to the
    // text written to the output when that tag is encountered.
    protected static Dictionary<string, string> _tags;
    // Lower-case names of tags whose whole inner content is discarded.
    protected static HashSet<string> _ignoretags;

    // Instance variables (state of the conversion in progress).
    protected textbuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only): fills the lookup tables.
    static htmltotext()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoretags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; null is treated as an empty string.</param>
    /// <returns>Resulting plain text with HTML entities decoded.</returns>
    public string convert(string html)
    {
        // Initialize state variables
        _text = new textbuilder();
        _html = html ?? string.Empty;
        _pos = 0;

        // Process input
        while (!endoftext)
        {
            if (peek() == '<')
            {
                // HTML tag
                bool selfclosing;
                string tag = parsetag(out selfclosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.preformatted = true;
                    eatwhitespacetonextline();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.preformatted = false;
                }

                // Emit the text associated with this tag, if any
                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.write(value);

                // Skip everything inside tags whose content is not text
                if (_ignoretags.Contains(tag))
                    eatinnercontent(tag);
            }
            else if (char.IsWhiteSpace(peek()))
            {
                // Whitespace (treated as a plain space unless preformatted)
                _text.write(_text.preformatted ? peek() : ' ');
                moveahead();
            }
            else
            {
                // Other text
                _text.write(peek());
                moveahead();
            }
        }

        // Entities are decoded last, after whitespace has been normalized
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Consumes all characters that are part of the tag at the current
    /// position and returns its lower-case name (closing tags keep their
    /// leading '/'). Returns an empty string if the current character is
    /// not '&lt;'.
    /// </summary>
    /// <param name="selfclosing">Set to true when the tag ends with "/&gt;".</param>
    protected string parsetag(out bool selfclosing)
    {
        string tag = string.Empty;
        selfclosing = false;

        if (peek() == '<')
        {
            moveahead();

            // Parse tag name
            eatwhitespace();
            int start = _pos;
            if (peek() == '/')
                moveahead();
            while (!endoftext && !char.IsWhiteSpace(peek()) &&
                   peek() != '/' && peek() != '>')
                moveahead();
            // Invariant lower-casing keeps the table lookups working under
            // cultures with special casing rules (e.g. Turkish dotless i).
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!endoftext && peek() != '>')
            {
                char c = peek();
                if (c == '"' || c == '\'')
                {
                    eatquotedvalue();
                    selfclosing = false;
                }
                else
                {
                    // Only a '/' that is the last non-whitespace character
                    // before '>' marks the tag as self-closing; a '/' inside
                    // an unquoted attribute value (href=http://x) does not.
                    if (c == '/')
                        selfclosing = true;
                    else if (!char.IsWhiteSpace(c))
                        selfclosing = false;
                    moveahead();
                }
            }
            // Skip the closing '>'
            moveahead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the given tag, up to and including its
    /// matching end tag (or to the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the tag that was just opened.</param>
    protected void eatinnercontent(string tag)
    {
        string endtag = "/" + tag;

        if (tag == "script" || tag == "style")
        {
            // The content of these elements is raw text, not markup: a '<'
            // inside it does not start a tag, so look for the end tag
            // literally instead of parsing the content as tags.
            while (!endoftext)
            {
                int end = _html.IndexOf("<" + endtag, _pos, StringComparison.OrdinalIgnoreCase);
                if (end < 0)
                {
                    _pos = _html.Length;
                    return;
                }
                _pos = end;
                bool ignored;
                if (parsetag(out ignored) == endtag)
                    return;
            }
            return;
        }

        // Other elements may nest; count only tags with the same name so that
        // unrelated tags without an end tag (e.g. <param>) cannot unbalance
        // the search for the matching end tag.
        int depth = 1;
        while (!endoftext)
        {
            if (peek() == '<')
            {
                bool selfclosing;
                string inner = parsetag(out selfclosing);
                if (inner == endtag)
                {
                    depth--;
                    if (depth == 0)
                        return;
                }
                else if (inner == tag && !selfclosing)
                {
                    depth++;
                }
            }
            else
            {
                moveahead();
            }
        }
    }

    // Returns true if the current position is at the end of the string
    protected bool endoftext
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ('\0' when positioned at the end of the string)
    protected char peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void moveahead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace character
    protected void eatwhitespace()
    {
        while (char.IsWhiteSpace(peek()))
            moveahead();
    }

    // Moves the current position to the next non-whitespace character or
    // the start of the next line, whichever comes first
    protected void eatwhitespacetonextline()
    {
        while (char.IsWhiteSpace(peek()))
        {
            char c = peek();
            moveahead();
            if (c == '\n')
                break;
        }
    }

    // Moves the current position past a quoted attribute value. The value
    // ends at the matching quote or at a line break, whichever comes first.
    protected void eatquotedvalue()
    {
        char c = peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            moveahead();
            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                moveahead(); // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder wrapper that helps eliminate excess whitespace.
    /// </summary>
    protected class textbuilder
    {
        private StringBuilder _text;      // completed output
        private StringBuilder _currline;  // line being assembled (normal mode only)
        private int _emptylines;          // consecutive empty lines seen so far
        private bool _preformatted;

        // Construction
        public textbuilder()
        {
            _text = new StringBuilder();
            _currline = new StringBuilder();
            _emptylines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Flush any pending line before switching to
                    // preformatted mode, which writes straight to the output
                    if (_currline.Length > 0)
                        flushcurrline();
                    _emptylines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void clear()
        {
            _text.Length = 0;
            _currline.Length = 0;
            _emptylines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write.</param>
        public void write(string s)
        {
            foreach (char c in s)
                write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write.</param>
        public void write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character unchanged
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    flushcurrline();
                }
                else if (char.IsWhiteSpace(c))
                {
                    // Collapse a run of whitespace into one space
                    int len = _currline.Length;
                    if (len == 0 || !char.IsWhiteSpace(_currline[len - 1]))
                        _currline.Append(' ');
                }
                else
                {
                    // Add character to current line
                    _currline.Append(c);
                }
            }
        }

        // Appends the current line to the output buffer, allowing at most
        // one empty line in a row and none at the very start of the output
        protected void flushcurrline()
        {
            // Get current line
            string line = _currline.ToString().Trim();

            // Determine if line contains non-space characters. Entities are
            // still encoded at this point (they are decoded at the end of
            // convert), so a line made only of &nbsp; counts as empty.
            string tmp = line.Replace("&nbsp;", string.Empty);
            if (tmp.Length == 0)
            {
                // An empty line
                _emptylines++;
                if (_emptylines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptylines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currline.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any pending
        /// line first.
        /// </summary>
        public override string ToString()
        {
            if (_currline.Length > 0)
                flushcurrline();
            return _text.ToString();
        }
    }
}
希望本文所述对大家的 C# 程序设计有所帮助。
更多关于 C# 实现将 HTML 转换成纯文本的方法的内容,请参阅相关文章。