如何提取html的正文以及保留某些<>内容？

正文提取就是去除掉html代码里<>（尖括号标签）的内容。这段代码增加了可选择保留某些<>内容的功能。
using System;
using System.Text;

namespace HtmlStrip
{
    /// <summary>
    /// Small demo program: strips the markup from a sample HTML fragment while
    /// keeping the <c>br</c> tags, and writes the result to the console.
    /// </summary>
    class MainClass
    {
        public static void Main(string[] args)
        {
            string str = "<p>abc</p><span>efg</span><br /><script>888</script>oo";
            // To process a file instead, load it into str first, e.g.:
            // System.IO.StreamReader rd = new System.IO.StreamReader("/home/lx/test.html");
            // str = rd.ReadToEnd();
            HtmlParser t = new HtmlParser(str);
            t.KeepTag(new string[] { "br" }); // do not filter out <br> tags
            Console.Write(t.Text());          // prints: abcefg<br />oo
        }
    }

    /// <summary>
    /// Extracts the plain text of an HTML fragment by dropping everything that
    /// is written inside angle brackets. Selected tags can be kept verbatim,
    /// the text between <c>script</c>/<c>style</c> tags is discarded, and
    /// comments are skipped.
    /// </summary>
    /// <remarks>
    /// This is a single-pass character scanner, not a real HTML parser: it does
    /// not decode entities and does not understand '&gt;' inside attribute values.
    /// </remarks>
    class HtmlParser
    {
        // Tags whose text content is normally unwanted in the extracted text.
        private static readonly string[] SpecialTags = new string[] { "script", "style" };

        private const string CommentOpen = "!--";  // what follows '<' at the start of a comment
        private const string CommentClose = "-->";

        private readonly string _html;                                  // markup being analysed
        private readonly StringBuilder _result = new StringBuilder();   // accumulated output
        private int _seek;                                              // index of the next unread character
        private string[] _keepTags = new string[0];                     // tag names copied to the output verbatim
        private bool _inTag;                                            // true while the cursor is between '<' and '>'
        private bool _needContent = true;                               // false while inside a script/style element
        private string _tagName = "";                                   // name of the tag currently being scanned

        /// <summary>
        /// Whether the cursor is currently inside angle brackets. Setting it to
        /// <c>true</c> (done right after a '&lt;' has been consumed) also reads
        /// the tag name that follows.
        /// </summary>
        public bool InTag
        {
            get { return _inTag; }
            set
            {
                _inTag = value;
                if (value)
                {
                    ReadTagName();
                }
            }
        }

        /// <summary>
        /// Creates a parser for the given markup.
        /// </summary>
        /// <param name="html">The HTML code to analyse.</param>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
        public HtmlParser(string html)
        {
            if (html == null)
            {
                throw new ArgumentNullException("html");
            }
            _html = html;
        }

        /// <summary>
        /// Sets the tags that must not be filtered out. Names are compared
        /// case-insensitively and without any '/' (so "br" matches
        /// <c>&lt;br&gt;</c>, <c>&lt;br/&gt;</c> and <c>&lt;/br&gt;</c>).
        /// Use "!--" to keep comments.
        /// </summary>
        /// <param name="tags">Tag names to keep; null is treated as "keep nothing".</param>
        public void KeepTag(string[] tags)
        {
            _keepTags = tags ?? new string[0];
        }

        /// <summary>
        /// Runs the extraction over the not-yet-consumed input.
        /// </summary>
        /// <returns>The processed text: the content outside tags plus any kept tags.</returns>
        public string Text()
        {
            int startTag = 0; // index of the '<' that opened the current tag
            while (_seek < _html.Length)
            {
                char c = Read();
                if (c == '<')
                {
                    startTag = _seek - 1;
                    if (string.CompareOrdinal(_html, _seek, CommentOpen, 0, CommentOpen.Length) == 0)
                    {
                        // Comments may legally contain '<' and '>', so they are
                        // skipped as a whole instead of being scanned as a tag.
                        SkipComment(startTag);
                    }
                    else
                    {
                        InTag = true; // also reads the tag name
                    }
                }
                else if (c == '>' && _inTag)
                {
                    _inTag = false;
                    if (IsKeepTag(_tagName.Replace("/", "")))
                    {
                        // Copy the whole tag, '<' through '>', attributes included.
                        _result.Append(_html, startTag, _seek - startTag);
                    }
                    else
                    {
                        // "<script>" switches content off; any other tag,
                        // including "</script>", switches it back on.
                        _needContent = !IsSpecialTag(_tagName);
                    }
                }
                else if (!_inTag && _needContent)
                {
                    // Ordinary text. A '>' that does not close a tag lands here
                    // too and is emitted literally.
                    _result.Append(c);
                }
            }
            return _result.ToString();
        }

        /// <summary>
        /// Reads the tag name that follows a '&lt;': leading whitespace is
        /// skipped and the name ends at the first whitespace, at '&gt;' or at
        /// the end of the input. A terminating '&gt;' is left unread so that
        /// <see cref="Text"/> can close the tag.
        /// </summary>
        private void ReadTagName()
        {
            StringBuilder name = new StringBuilder();
            while (_seek < _html.Length)
            {
                char c = _html[_seek];
                if (c == '>')
                {
                    break;
                }
                _seek++;
                if (char.IsWhiteSpace(c))
                {
                    if (name.Length > 0)
                    {
                        break;
                    }
                    continue; // whitespace before the name, e.g. "< br>"
                }
                name.Append(c);
            }
            _tagName = name.ToString();
        }

        /// <summary>
        /// Moves the cursor past the comment whose '&lt;' is at
        /// <paramref name="startTag"/>. An unterminated comment extends to the
        /// end of the input. The comment is copied to the output only when
        /// "!--" is one of the kept tags.
        /// </summary>
        private void SkipComment(int startTag)
        {
            int close = _html.IndexOf(CommentClose, _seek + CommentOpen.Length, StringComparison.Ordinal);
            _seek = close < 0 ? _html.Length : close + CommentClose.Length;
            if (IsKeepTag(CommentOpen))
            {
                _result.Append(_html, startTag, _seek - startTag);
            }
        }

        /// <summary>
        /// Determines whether a tag has to be kept in the output.
        /// </summary>
        /// <param name="tag">Tag name with any '/' already removed.</param>
        /// <returns>true if the name matches one of the kept tags, ignoring case.</returns>
        private bool IsKeepTag(string tag)
        {
            foreach (string keep in _keepTags)
            {
                if (string.Equals(tag, keep, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Determines whether a tag name opens an element whose text content
        /// must be dropped (script or style), ignoring case.
        /// </summary>
        private static bool IsSpecialTag(string tag)
        {
            foreach (string special in SpecialTags)
            {
                if (string.Equals(tag, special, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Returns the character under the cursor and advances the cursor.
        /// Callers must make sure the cursor is inside the input.
        /// </summary>
        private char Read()
        {
            return _html[_seek++];
        }
    }
}
以上就是“如何提取html的正文以及保留某些<>内容”的详细内容。

如何提取html的正文以及保留某些<>内容？

推荐信息