只能说这代码不怎么样,还不如我以前用 C# 自己研究的那个准确。我那个版本基本上只有遇到百度知道这种内容超少的页面才会误抓;
对大型新闻站或普通博客类页面,准确率在 95% 以上。
而今天翻译的这个。。。。吐血了。。。。
上代码吧。
// Extract the main body text of a web page.
import inet;
import inet.http;
import preg;
import console;

namespace textExtract;

// Downloads a page, strips markup, and returns the longest continuous run of
// text lines, on the assumption that this run is the article body.
//
// Instance fields:
//   url         - address of the page (updated if a meta refresh is followed)
//   blkSize     - number of consecutive lines summed into one block
//   rawPageCode - downloaded page source
//   textLines   - whitespace-free text lines of the stripped page
//   blksLen     - blksLen[i] is the total length of lines i .. i+blkSize-1
//   text        - result of the last getPlainText() call
//   isGB        - true when the declared charset starts with "GB"
class textExtract {

    // url:     page to fetch
    // blkSize: block size in lines used when measuring text density
    ctor( url,blkSize=3 ) {
        this.url = url;
        this.blkSize = blkSize;

        // Make the global libraries reachable from inside the class namespace.
        string = ..string;
        table = ..table;
        preg = ..preg;
        console = ..console;

        this.textLines = {};
        this.rawPageCode = '';
        this.blksLen = {};
        this.text = '';
    }

    // Download this.url into this.rawPageCode, following one
    // <META http-equiv="refresh"> redirect if the page contains it.
    getPageCode = function(){
        var http = ..inet.http();
        var code = http.get( this.url );

        if( code && string.indexOf( code, "<META http-equiv=""refresh"" content=" ) ){
            var gourl = string.match( code, "\<META\s+http-equiv=""refresh""\s+content=""\d+;URL='(.+?)'""" );
            if( gourl ){
                this.url = gourl;
                code = http.get( this.url );
            }
        }
        http.close();

        // Store a failed download as an empty page so the later string calls
        // do not receive a non-string value.
        // NOTE(review): assumes http.get yields null/false on failure - confirm.
        if( !code ) code = '';
        this.rawPageCode = code;

        // Dump of the fetched source, kept from the original for inspection.
        string.save( "/get.html", this.rawPageCode );
    }

    // Read the charset declared in the page and record in this.isGB whether
    // it is a GB* encoding. A page without a charset is treated as utf-8.
    // The page source itself is left untouched: the bytes are not transcoded
    // here, so the declaration must not be rewritten either.
    procEncoding = function() {
        var pp = preg( "charset\s*=\s*""?([^"">\s]+)", "i" );
        var charset = pp.match( this.rawPageCode );
        if( !charset || charset == "" ) charset = "utf-8";

        var prefix = string.sub( charset, 1, 2 );
        this.isGB = ( string.upper( prefix ) == 'GB' );
    }

    // Return this.rawPageCode with DOCTYPE, comments, scripts, styles, all
    // remaining tags and character entities removed.
    preProcess = function() {
        // Raw (double-quoted) literals keep every backslash for the regex engine.
        // Order matters: script/style bodies must go before the generic tag rule.
        var patterns = {
            "<!DOCTYPE.*?>";              // 1. DTD information
            "<!--.*?-->";                 // 2. HTML comments
            "<script.*?>.*?<\/script>";   // 3. JavaScript
            "<style.*?>.*?<\/style>";     // 4. CSS
            "<.*?>";                      // 5. HTML tags
            "&.{1,5};|&#.{1,5};";         // 6. character entities
        };

        var content = this.rawPageCode;
        for( i=1; #patterns; 1 ){
            content = preg( patterns[i], "si" ).replace( content, '' );
        }
        return content;
    }

    // Split rawText into lines, remove all whitespace inside each line and
    // append the results to this.textLines (empty lines are kept).
    getTextLines = function( rawText ) {
        // Normalize line endings to '\n'.
        var text = string.replace( rawText, '\r\n', '\n' );
        text = string.replace( text, '\r', '\n' );

        var lines = string.split( text, '\n' );
        for( i=1; #lines; 1 ){
            // Assign first so that only the replaced string is pushed.
            var stripped = string.replace( lines[i], "\s+", '' );
            table.push( this.textLines, stripped );
        }
    }

    // Fill this.blksLen with the length of every block of this.blkSize
    // consecutive lines.
    calBlocksLen = function() {
        var textLineNum = #this.textLines;

        // Length of the first block.
        var blkLen = 0;
        for( i=1; this.blkSize; 1 ){
            blkLen += #this.textLines[i];
        }
        table.push( this.blksLen, blkLen );

        // Every further block is the previous one plus the line that enters
        // the window minus the line that leaves it.
        for( i=2; textLineNum-this.blkSize; 1 ){
            blkLen = this.blksLen[i-1] + #this.textLines[i-1+this.blkSize] - #this.textLines[i-1];
            table.push( this.blksLen, blkLen );
        }
    }

    // Fetch the page and return its main text: the run of consecutive
    // non-empty blocks whose lines have the greatest total length. Lines are
    // joined with '<br />'. The result is also stored in this.text; it is ''
    // when nothing could be extracted.
    getPlainText = function() {
        // Start from a clean state so repeated calls do not accumulate lines.
        this.textLines = {};
        this.blksLen = {};
        this.text = '';

        this.getPageCode();
        this.procEncoding();
        var preProcText = this.preProcess();
        this.getTextLines( preProcText );
        this.calBlocksLen();

        var blkNum = #this.blksLen;
        var maxTextLen = 0;

        // A while loop is used because the index is advanced inside the body.
        var i = 1;
        while( i <= blkNum ){
            // Skip blocks that contain no text.
            while( i <= blkNum && this.blksLen[i] == 0 ){
                i++;
            }
            if( i > blkNum ) break;

            // Collect the lines of one run of non-empty blocks.
            var curTextLen = 0;
            var portion = '';
            while( i <= blkNum && this.blksLen[i] != 0 ){
                var line = this.textLines[i];
                if( line != '' ) {
                    portion ++= line;
                    portion ++= '<br />';
                    curTextLen += #line;
                }
                i++;
            }

            // Keep the longest run seen so far (the first one wins a tie).
            if( curTextLen > maxTextLen ) {
                this.text = portion;
                maxTextLen = curTextLen;
            }
        }

        return this.text;
    }
}
原理挺简单,可惜我试了十个新闻源,准确的能有 3 个就不错了,其余的错得一塌糊涂。原 PHP 代码是10年的。
调用:(自己import )
// Extract the main text of the page whose address is in the URL text box.
var extractor = textExtract.textExtract( mainForm.TB_Url.text );
var html = extractor.getPlainText();

// Code page 936 -> 65001 conversion, applied only when the extractor
// flagged the page as GB-encoded.
if( extractor.isGB ){
    html = string.fromto( html, 936, 65001 );
}

// Save the result and show it in the browser control.
string.save( "/body.html", html );
wb.go( "/body.html" );
发表评论