C#实现将HTML转换成纯文本的方法

本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下：
使用方法：
// Convert the HTML held in one text box to plain text and show it in another.
HtmlToText convert = new HtmlToText();
textBox2.Text = convert.Convert(textBox1.Text);
C#代码如下：
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a single-pass scanner: tags are stripped, a fixed set of
/// block-level tags is mapped to line breaks (or a tab for table cells), the
/// content of script/noscript/style/object elements is dropped, and HTML
/// entities are decoded once at the very end. An instance keeps its scan
/// state in fields, so one instance must not be used for two conversions at
/// the same time.
/// </remarks>
class HtmlToText
{
    // Static data tables
    // Maps a lower-case tag name (closing tags include the leading '/') to
    // the text emitted in its place.
    protected static Dictionary<string, string> _tags;
    // Lower-case names of elements whose entire inner content is discarded.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;   // output accumulator
    protected string _html;        // input being scanned
    protected int _pos;            // current index into _html

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; null is treated as empty input.</param>
    /// <returns>Resulting plain text with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? string.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.Write(value);

                if (_ignoreTags.Contains(tag))
                    EatInnerContent(tag);
            }
            else if (char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last so that an encoded '<' in the text is
        // never mistaken for the start of a tag during scanning.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns
    /// information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true when a '/' appears outside a
    /// quoted value after the tag name.</param>
    /// <returns>The lower-case tag name, including a leading '/' for closing
    /// tags; empty if the current character is not '&lt;'.</returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = string.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
                MoveAhead();
            while (!EndOfText && !char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
                MoveAhead();
            // Invariant casing: a culture-sensitive ToLower() would turn
            // "DIV" into "dıv" under e.g. a Turkish culture and miss the
            // lookup tables.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                    EatQuotedValue();
                else
                {
                    if (Peek() == '/')
                        selfClosing = true;
                    MoveAhead();
                }
            }
            // Step over the closing '>'
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes inner content from the current tag, up to and including the
    /// matching end tag (or the end of input if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the element that was just opened.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;
        // Number of currently open elements named `tag`. Only same-name
        // elements are counted: an unrelated inner tag must not consume the
        // end tag that belongs to this element.
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    if (--depth == 0)
                        return;
                }
                else if (inner == tag && !selfClosing && !tag.StartsWith("/"))
                {
                    depth++;
                }
            }
            else MoveAhead();
        }
    }

    /// <summary>
    /// Returns true if the current position is at the end of the string.
    /// </summary>
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    /// <summary>
    /// Safely returns the character at the current position, or (char)0 when
    /// the position is at or past the end of the input.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character; never
    /// moves past the end of the input.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        while (char.IsWhiteSpace(Peek()))
            MoveAhead();
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or
    /// the start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
                break;
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. Does nothing if the
    /// current character is not a single or double quote.
    /// </summary>
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();
            // Find end of value: the matching quote, or a line break so an
            // unterminated quote cannot swallow the rest of the document.
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead();    // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;       // completed output
        private StringBuilder _currLine;   // line being assembled (non-preformatted mode)
        private int _emptyLines;           // consecutive empty lines flushed so far
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                        FlushCurrLine();
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write, processed one character at a time.</param>
        public void Write(string s)
        {
            foreach (char c in s)
                Write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (char.IsWhiteSpace(c))
                {
                    // Write single space character; runs of whitespace
                    // collapse to one space
                    int len = _currLine.Length;
                    if (len == 0 || !char.IsWhiteSpace(_currLine[len - 1]))
                        _currLine.Append(' ');
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // Determine if line contains non-space characters.
            // NOTE(review): the literal here was lost in the published
            // listing (only whitespace remained between the parentheses).
            // "&nbsp;" is assumed because entities are still encoded at this
            // point and Trim() has already removed plain whitespace - confirm.
            string tmp = line.Replace("&nbsp;", string.Empty);
            if (tmp.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at the
                // very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any partially
        /// assembled line first.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
                FlushCurrLine();
            return _text.ToString();
        }
    }
}
希望本文所述对大家的C#程序设计有所帮助。
更多c#实现将html转换成纯文本的方法。

C#实现将HTML转换成纯文本的方法

推荐信息