只能说这代码不怎么样,还不如我以前用 C# 自己研究的那个准确:那个基本上只有遇到百度知道这类内容超少的页面才会误抓,
对大新闻站或普通博客类的页面,准确率在 95% 以上。
而今天翻译的这个……吐血了。
上代码吧。
//提取网页正文
import inet;
import inet.http;
import preg;
import console;
namespace textExtract;
// Extracts the main body text of a web page.
// The page is fetched, markup is stripped, the remaining text is split into
// whitespace-free lines, and the lengths of sliding blocks of blkSize lines
// are computed. The longest run of consecutive non-empty blocks is returned
// as the page text.
//
// Instance fields:
//   url          address of the page (replaced by the meta-refresh target when one is followed)
//   blkSize      number of consecutive lines summed into one block
//   rawPageCode  page source as fetched (charset declaration may be rewritten by procEncoding)
//   textLines    one whitespace-stripped string per line of the stripped page
//   blksLen      blksLen[i] is the total length of lines i .. i+blkSize-1
//   text         result of the last getPlainText call
//   isGB         set by procEncoding: true when the declared charset starts with "GB"
class textExtract {
	// url:     address of the page to fetch.
	// blkSize: how many consecutive lines form one block (defaults to 3).
	ctor( url,blkSize=3 ) {
		this.url = url;
		this.blkSize = blkSize;
		// Make the global libraries reachable by their short names inside the class namespace.
		string = ..string;
		table = ..table;
		preg = ..preg;
		console = ..console;
		this.textLines = {};
		this.rawPageCode='';
		this.blksLen = {};
		this.text='';
	};

	// Downloads this.url into this.rawPageCode, following one
	// <META http-equiv="refresh"> redirect when present, and dumps the
	// fetched source to /get.html.
	getPageCode = function(){
		var http = ..inet.http();
		// A failed request must not leave a non-string value here, because the
		// string and regex calls that follow expect a string; fall back to ''.
		this.rawPageCode = http.get(this.url) || '';
		if( string.indexOf( this.rawPageCode, "<META http-equiv=""refresh"" content=" ) ){
			var gourl = string.match( this.rawPageCode, "\<META\s+http-equiv=""refresh""\s+content=""\d+;URL='(.+?)'""" );
			if( gourl ){
				this.url = gourl;
				this.rawPageCode = http.get(this.url) || '';
			}
		}
		// Release the HTTP client once both possible requests are done.
		http.close();
		// Debug dump of the fetched source.
		string.save("/get.html",this.rawPageCode )
	};

	// Reads the charset declared in the page source and sets this.isGB.
	// When no charset is found, "utf-8" is assumed.
	procEncoding = function() {
		var patt = "charset\s*=\s*""?([^"">\s]+)";
		var pp=preg(patt,"i");
		var matches = pp.match( this.rawPageCode );
		if( !matches || matches=="" ) matches = "utf-8";
		// Only the first two characters decide: GB2312 / GBK / GB18030 all start with "GB".
		var tmp = string.sub( matches, 1, 2 );
		if( string.upper(tmp) != 'GB' ) {
			this.isGB = false;
			// The declared charset is rewritten to GBK; the page bytes themselves
			// are not converted in this method.
			var replacement = 'charset=GBK"';
			this.rawPageCode = pp.replace( this.rawPageCode, replacement );
		} else {
			this.isGB = true;
		}
	};

	// Returns this.rawPageCode with everything that is not visible text removed.
	// The patterns are applied in order; each match is replaced by an empty string.
	preProcess = function() {
		console.log(" aa00 ");
		var content = this.rawPageCode;
		var patterns = {
			'<!DOCTYPE.*?>';              // 1. DTD information
			'<!--.*?-->';                 // 2. HTML comments
			'<script.*?>.*?<\/script>';   // 3. JavaScript
			'<style.*?>.*?<\/style>';     // 4. CSS
			'<.*?>';                      // 5. remaining HTML tags
			'&.{1,5};|&#.{1,5};';         // 6. character entities
		};
		for(i=1;#patterns;1){
			content = preg(patterns[i],"si").replace( content, '' );
		}
		return content;
	};

	// Splits rawText into lines, removes all whitespace inside each line and
	// appends the results to this.textLines.
	getTextLines = function( rawText ) {
		// Normalise every line-break form to '\n' before splitting.
		var order = { '\r\n'; '\n'; '\r'; };
		var normalized = rawText;
		for(i=1;#order;1){
			normalized = string.replace( normalized, order[i], '\n' );
		}
		var lines = string.split( normalized, '\n' );
		for(i=1;#lines;1){
			var stripped = string.replace( lines[i],'\s+','' );
			table.push( this.textLines, stripped );
		}
	};

	// Fills this.blksLen with the summed length of each block of
	// this.blkSize consecutive entries of this.textLines.
	calBlocksLen = function() {
		var textLineNum = #this.textLines;
		// The first block is summed directly.
		var blkLen = 0;
		for(i=1;this.blkSize;1){
			blkLen += #this.textLines[i];
		}
		table.push( this.blksLen, blkLen );
		// Every following block is derived from the previous one: add the line
		// that enters the window and subtract the line that leaves it.
		// NOTE(review): the loop stops at textLineNum-blkSize, so the window that
		// starts at textLineNum-blkSize+1 is never produced - confirm whether
		// that is intended before changing it, since it affects the output.
		for(i=2;textLineNum-this.blkSize;1){
			blkLen = this.blksLen[i-1] + #this.textLines[i-1+this.blkSize] - #this.textLines[i-1];
			table.push( this.blksLen, blkLen );
		}
	};

	// Runs the whole pipeline (fetch, charset check, markup stripping, line
	// splitting, block lengths) and returns the longest continuous stretch of
	// text, with '<br />' appended after every non-empty line. The result is
	// also stored in this.text and dumped to the console.
	getPlainText = function() {
		// Start from a clean state: getTextLines and calBlocksLen only append,
		// so a second call would otherwise mix in the previous page's data.
		this.textLines = {};
		this.blksLen = {};
		this.text = '';

		this.getPageCode();
		this.procEncoding();
		var preProcText = this.preProcess();
		this.getTextLines( preProcText );
		this.calBlocksLen();

		var maxTextLen = 0;
		var blkNum = #this.blksLen;
		// An explicit cursor is used instead of a counted for loop because the
		// scan itself moves the position forward by a variable amount.
		var i = 1;
		while( i<=blkNum ){
			// Skip blocks that contain no text at all.
			while( i<=blkNum && this.blksLen[i]==0 ){ i++; }
			if( i>blkNum ) break;

			// Collect lines until the next empty block ends this run.
			var curTextLen = 0;
			var portion = '';
			while( i<=blkNum && this.blksLen[i]!=0 ){
				if( this.textLines[i] != '' ) {
					portion ++= this.textLines[i];
					portion ++= '<br />';
					curTextLen += #this.textLines[i];
				}
				i++;
			}

			// Keep the run with the most text; on a tie the earlier run stays.
			if( curTextLen > maxTextLen ) {
				this.text = portion;
				maxTextLen = curTextLen;
			}
		}
		console.varDump( this.text )
		return this.text;
	};
}
原理挺简单,可惜我试了十个新闻源,准确的能有 3 个就不错了,其余的错得一塌糊涂。原 PHP 代码是 10 年的。
调用:(自己import )
// Build an extractor for the URL held in mainForm.TB_Url
// (presumably a text box on the main form - confirm against the form definition).
var extractor = textExtract.textExtract( mainForm.TB_Url.text );
var extracted = extractor.getPlainText();
// When the page declared a GB charset, convert the text from code page 936 to 65001.
if( extractor.isGB ){
	extracted = string.fromto( extracted, 936, 65001 );
}
// Save the result and open it through wb
// (presumably a web-browser control created elsewhere - confirm).
string.save("/body.html",extracted )
wb.go("/body.html");
发表评论