获取网页正文,提取网页正文,用aardio翻译的php类库来实现的

只能说这段代码效果不怎么样，还不如我以前用 C# 自己研究的那套准确——那套基本上只有遇到百度知道这类正文极少的页面才会误抓。

对大型新闻站或普通博客类页面，我那套的准确率基本在 95% 以上。

而今天翻译的这个……效果让人吐血。

上代码吧。

//提取网页正文
import inet;
import inet.http; 
import preg;
import console;
namespace textExtract;

class textExtract {

    /*
    Extracts the main body text of a web page.

    Approach: download the page, strip markup, split what is left into
    whitespace-free lines, measure sliding blocks of blkSize consecutive
    lines, and keep the longest run of consecutive non-empty blocks.

    Instance fields:
      url          page address (replaced by the refresh target when one is followed)
      blkSize      number of consecutive lines that form one block
      rawPageCode  page source as downloaded (charset declaration may be rewritten)
      textLines    tag-free lines with all whitespace removed
      blksLen      blksLen[i] = total length of textLines[i] .. textLines[i+blkSize-1]
      text         extracted body, lines joined with '<br />'
      isGB         true when the declared charset starts with "GB"
    */
    ctor( url,blkSize=3 ) {
        this.url = url;
        this.blkSize = blkSize;
        // Alias the global libraries (reached through the `..` prefix) so the
        // methods below can use the short names.
        string = ..string;
        table = ..table;
        preg = ..preg;
        console = ..console;
        this.textLines = {};
        this.rawPageCode='';
        this.blksLen = {};
        this.text='';
    }

    // Download this.url into this.rawPageCode, following one META refresh
    // redirect when the page contains one.
    getPageCode = function(){
        var http = ..inet.http();
        var html = http.get(this.url);
        // A failed request yields no body; fall back to an empty page so the
        // string functions below are never handed a null value.
        if( !html ) html = '';

        if( string.indexOf( html, "<META http-equiv=""refresh"" content=" ) ){
            var gourl = string.match( html, "\<META\s+http-equiv=""refresh""\s+content=""\d+;URL='(.+?)'""" );
            if( gourl ){
                this.url = gourl;
                html = http.get(this.url);
                if( !html ) html = '';
            }
        }
        // Release the connection handle once both requests are done.
        http.close();

        this.rawPageCode = html;
        // Debug aid: keep a copy of the fetched source next to the application.
        string.save("/get.html",this.rawPageCode );
    }

    // Read the charset declared in the page and record in this.isGB whether
    // it is a GB* encoding. Pages with no declaration are treated as utf-8.
    procEncoding = function() {
        var patt = "charset\s*=\s*""?([^"">\s]+)";
        var pp = preg(patt,"i");
        var matches = pp.match( this.rawPageCode );
        if( !matches || matches=="" ) matches = "utf-8";
        var tmp = string.sub( matches, 1, 2 );

        if( string.upper(tmp) != 'GB' ) {
            this.isGB = false;
            // NOTE(review): this only relabels the declaration, the page bytes are
            // not transcoded; the trailing quote can also leave a doubled quote
            // behind. Kept as-is because the tags are stripped afterwards - confirm
            // whether the relabelling is still wanted.
            var replacement = 'charset=GBK"';
            this.rawPageCode = pp.replace( this.rawPageCode, replacement );
        } else {
            this.isGB = true;
        }
    }

    // Remove everything that is not visible text from this.rawPageCode and
    // return the remainder. The order matters: whole script/style elements must
    // go before the generic tag pattern, otherwise their contents would survive.
    preProcess = function() {
        var patterns = {
            '<!DOCTYPE.*?>';                // 1. DTD information
            '<!--.*?-->';                   // 2. HTML comments
            '<script.*?>.*?<\/script>';     // 3. JavaScript
            '<style.*?>.*?<\/style>';       // 4. CSS
            '<.*?>';                        // 5. any remaining HTML tag
            '&.{1,5};|&#.{1,5};';           // 6. character entities
        };

        var content = this.rawPageCode;
        for(i=1;#patterns;1){
            content = preg(patterns[i],"si").replace( content, '' );
        }
        return content;
    }

    // Split rawText into lines, strip all whitespace from each line and append
    // the results to this.textLines (empty lines are kept so indexes stay aligned).
    getTextLines = function( rawText ) {
        // Normalise every line-ending style to '\n' before splitting.
        rawText = string.replace( rawText, '\r\n', '\n' );
        rawText = string.replace( rawText, '\r', '\n' );

        var lines = string.split( rawText, '\n' );
        for(i=1;#lines;1){
            // The pattern is written as a raw double-quoted string so the
            // backslash reaches the pattern engine untouched.
            table.push( this.textLines, string.replace( lines[i],"\s+",'' ) );
        }
    }

    /* Calculate the blocks' length
     * Fills this.blksLen so that entry i holds the summed length of the
     * blkSize lines starting at line i.
     * @return void
    */
    calBlocksLen = function() {
        var textLineNum = #this.textLines;

        // First block: add up the first blkSize lines directly.
        var blkLen = 0;
        for(i=1;this.blkSize;1){
            blkLen += #this.textLines[i];
        }
        table.push( this.blksLen, blkLen );

        // Every later block is the previous one plus the line that enters the
        // window minus the line that leaves it.
        for(i=2;textLineNum-this.blkSize;1){
            blkLen = this.blksLen[i-1] +  #this.textLines[i-1+this.blkSize] - #this.textLines[i-1];
            table.push( this.blksLen, blkLen );
        }
    }

    /*
     * Extract the text from the web page's source code
     * according to the simple idea:
     * [the text should be the longest continuous content
     * in the web page]
     * Runs the whole pipeline (download, charset check, tag stripping, line
     * splitting, block measuring) and stores the winner in this.text.
     * @return string
    */
    getPlainText = function() {
        // Start from a clean state so a second call does not append to the
        // lines and block lengths collected by an earlier one.
        this.textLines = {};
        this.blksLen = {};
        this.text = '';

        this.getPageCode();
        this.procEncoding();
        var preProcText = this.preProcess();
        this.getTextLines( preProcText );
        this.calBlocksLen();

        var maxTextLen = 0;
        var blkNum = #this.blksLen;

        // The index is advanced by hand inside the body, so a while loop is used
        // instead of relying on how a for counter reacts to being modified.
        var i = 1;
        while( i<=blkNum ){
            // Skip the gap of empty blocks in front of the next candidate.
            while( i<=blkNum && this.blksLen[i]==0 ){ i++; }
            if( i>blkNum ) break;

            // Collect one run of consecutive non-empty blocks.
            var curTextLen = 0;
            var portion = '';
            while( i<=blkNum && this.blksLen[i]!=0 ){
                if( this.textLines[i] != '' ) {
                    portion ++= this.textLines[i];
                    portion ++= '<br />';
                    curTextLen += #this.textLines[i];
                }
                i++;
            }

            // Keep the run carrying the most text.
            if( curTextLen > maxTextLen ) {
                this.text = portion;
                maxTextLen = curTextLen;
            }
        }
        return this.text;
    }

}

原理挺简单，可惜我试了十个新闻源，准确的能有 3 个就不错了，其余的错得一塌糊涂。原 PHP 代码是 10 年的。

调用示例（所需的库请自行 import）：

// Usage example: extract the body text of the address held in mainForm.TB_Url.
var ext = textExtract.textExtract( mainForm.TB_Url.text );
    var body = ext.getPlainText();
    // isGB is set by the extractor when the page declared a GB* charset; in that
    // case convert the result from code page 936 (GBK) to 65001 (UTF-8).
    if( ext.isGB ) body = string.fromto( body, 936, 65001 );
    // Save the result and open it in wb (presumably a web browser control - confirm).
    string.save("/body.html",body )
    wb.go("/body.html");

本博客所有文章如无特别注明均为原创。作者：恶猫，复制或转载请以超链接形式注明转自恶猫的博客。
原文地址《获取网页正文,提取网页正文,用aardio翻译的php类库来实现的》