GoldSeeker Data Extraction Tool

/**************************

* GOLD SEEKER
* Data Extraction Tool
**************************/


== DOCUMENTATION ==============================================================


Public Methods
--------------

    GoldSeeker::setVerbose(boolean)
    
    Tells GoldSeeker whether to display its results to the browser.



Features & samples
------------------

I. Simple value extraction
~~~~~~~~~~~~~~~~~~~~~~~~~~

    Config file:
    ************
    <<locate name="pageTitle">>
        <<begin>><title><</begin>>
        <<end>></title><</end>>
    <</locate>>
    
    Data source:
    ************
    <html>
    <head>
    <title>My sample</title>
    </head>
    </html>
    
    
    Result:
    *******
    Array
    (
        [0] => Array
            (
                [name] => pageTitle
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => My sample
                                [position] => 26
                            )
    
                    )
    
            )
    
    )


II. Loop value extraction
~~~~~~~~~~~~~~~~~~~~~~~~~

    Config file:
    ************
    <<locate>>
        <<begin>><table width="100%"><</begin>>

        <<section name="firstCol,secondCol">>
            <<begin>><td><</begin>>
            <<end>></td><</end>>

            <<begin>><td><</begin>>
            <<end>></td><</end>>

            <<endofline>></tr><</endofline>>
        <</section>>

        <<end>></table><</end>>
    <</locate>>
    
    Data source:
    ************
    <html>
    <body>
    <table>
        <tr>
            <td>I don't want the data from this table</td>
            <td></td>
        </tr>
    </table>
    <table width="100%">
        <tr>
            <td>This is the data I want</td>
            <td>This too</td>
            <td>Not this</td>
        </tr>
        <tr>
            <td>I want...</td>
            <td>I want...</td>
            <td>I don't want</td>
        </tr>
    </table>
    </body>
    </html>
    
    
    Result:
    *******
    Array
    (
        [0] => Array
            (
                [name] => firstCol
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => This is the data I want
                                [position] => 159
                            )
    
                        [1] => Array
                            (
                                [contents] => I want...
                                [position] => 276
                            )
    
                    )
    
            )
    
        [1] => Array
            (
                [name] => secondCol
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => This too
                                [position] => 209
                            )
    
                        [1] => Array
                            (
                                [contents] => I want...
                                [position] => 313
                            )
    
                    )
    
            )
    
    )


III. Startsafter feature
~~~~~~~~~~~~~~~~~~~~~~~~

    Config file:
    ************
    <<locate name="dummy">>
        <<begin>>John Doe<</begin>>
        <<end>><a<</end>>
    <</locate>>
    <<locate name="path" startsafter=$dummy>>
        <<begin>>href="mailto:<</begin>>
        <<end>>"<</end>>
    <</locate>>
    <<locate name="age" startsafter=$path>>
        <<begin>>(<</begin>>
        <<end>>)<</end>>
    <</locate>>

    
    Data source:
    ************
    <html>
    <body>
    Members
    <br><br>
    <br>Julian Ray <a href="mailto:jray@dummy.com"> (18)
    <br>Samantha Williams <a href="mailto:swill@dummy.com"> (33)
    <br>John Doe <a href="mailto:jdoe@dummy.com"> (21)
    <br>Max Dammis <a href="mailto:mdamm@dummy.com"> (17)
    </body>
    </html>
    
    
    Result:
    *******
    Array
    (
        [0] => Array
            (
                [name] => dummy
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] =>  
                                [position] => 169
                            )
    
                    )
    
            )
    
        [1] => Array
            (
                [name] => path
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => jdoe@dummy.com
                                [position] => 186
                            )
    
                    )
    
            )
    
        [2] => Array
            (
                [name] => age
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => 21
                                [position] => 204
                            )
    
                    )
    
            )
    
    )


IV. Crawling feature & database export
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Config files:
    *************
    sample1.gs:
    -----------
    <<locate>>
    
        <<section name="urlPage,nomPage">>
    
            <<begin>>p class=g><a href=<</begin>>
            <<end>>><</end>>
    
            <<begin>><</begin>>
            <<end>></a><</end>>
    
        <</section>>
    
    <</locate>>
    
    
    <<action crawl url="$urlPage" configfile="./sample2.gs">>

    
    sample2.gs:
    -----------
    <<locate name="title">>
    
        <<begin>><title><</begin>>
        <<end>></title><</end>>
    
    <</locate>>
    
    <<action insert table="googleTitles" fields="title" values="$title"/>>
    
    Data source:
    ************
    http://www.google.fr/search?sourceid=navclient&hl=fr&ie=UTF-8&oe=UTF-8&q=test
    
    
    Result:
    *******
    Fills in the googleTitles table.


V. Nested section / <<or>> example
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Config file:
    ************
    <<locate>>
        <<begin>><table><</begin>>
        
        <<section name="productName,productPrice,dummy">>
            <<begin>><td><</begin>>
            <<end>>$<</end>>
        
            <<begin>><</begin>>
            <<end>></td><</end>>

            <<begin>>Characteristics<</begin>>
            
            <<section name="productCharacteristics">>
            
                <<begin>><tr><td><</begin>>
                <<end>></td></tr><</end>>

            <</section>>
            
            <<end>></table><</end>>
        
            <<endofline>>c_grey<<or>>c_red<</endofline>>
        
        <</section>>
        
        <<end>></html><</end>>
    <</locate>>
    
    Data source:
    ************
    <html>
    <body>
    
    <table>
        <tr>
            <td colspan=10>Best deals</td>
        </tr>
        
        <tr class="c_grey">
            <td>1. Logetich XM 007 $40</td>
            <td>instead of $90</td>
            <td>
                Characteristics:<br>
                <table>
                    <tr><td>Triple optical lense</td></tr>
                    <tr><td>5610654000 dpi precision</td></tr>
                    <tr><td>Cool shape</td></tr>
                    </tr>
                </table>
            </td>
        </tr>
        <tr class="c_white">
            <td>Get this great price with code "65ds4f0"</td>
        </tr>
        <tr class="c_red">
            <td>2. Mocrisoft Zindoz XXX $120</td>
            <td>instead of $200</td>
            <td>
                Characteristics:<br>
                <table>
                    <tr><td>Over 12000 bugs featured</td></tr>
                    <tr><td>Easy to use</td></tr>
                    <tr><td>Complete boxed set</td></tr>
                </table>
            </td>
        </tr>
        <tr class="c_white">
            <td>Get this great price with code "551654sdf"</td>
        </tr>
        <tr class="c_grey">
            <td>3. Virbatem CD-x 900min. pk50 $10</td>
            <td>instead of $20</td>
            <td>
                Characteristics:<br>
                <table>
                    <tr><td>Those are CDs</td></tr>
                    <tr><td>There are 50 of them</td></tr>
                    <tr><td>You can record 90min on each</td></tr>
                    <tr><td>And they're cool</td></tr>
                </table>
            </td>
        </tr>
        <tr class="c_white">
            <td>Get this great price with code "sdf654e"</td>
        </tr>
    </table>
    
    </body>
    </html>
    
    
    Result:
    *******
    
    Array
    (
        [0] => Array
            (
                [name] => productName
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => 1. Logetich XM 007
                                [position] => 58
                            )
    
                        [1] => Array
                            (
                                [contents] => 2. Mocrisoft Zindoz XXX
                                [position] => 828
                            )
    
                        [2] => Array
                            (
                                [contents] => 3. Virbatem CD-x 900min. pk50
                                [position] => 1593
                            )
    
                    )
    
            )
    
        [1] => Array
            (
                [name] => productPrice
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => 40
                                [position] => 78
                            )
    
                        [1] => Array
                            (
                                [contents] => 120
                                [position] => 853
                            )
    
                        [2] => Array
                            (
                                [contents] => 10
                                [position] => 1624
                            )
    
                    )
    
            )
    
        [2] => Array
            (
                [name] => dummy
                [instances] => Array
                    (
                        [0] => Array
                            (
                                [contents] => :<br>
                    <table>
                        <tr><td>Triple optical lense</td></tr>
                        <tr><td>5610654000 dpi precision</td></tr>
                        <tr><td>Cool shape</td></tr>
                        </tr>
                    
                                [position] => 487
                                [subvars] => Array
                                    (
                                        [0] => Array
                                            (
                                                [name] => productCharacteristics
                                                [instances] => Array
                                                    (
                                                        [0] => Array
                                                            (
                                                                [contents] => Triple optical lense
                                                                [position] => 45
                                                            )
    
                                                        [1] => Array
                                                            (
                                                                [contents] => 5610654000 dpi precision
                                                                [position] => 124
                                                            )
    
                                                        [2] => Array
                                                            (
                                                                [contents] => Cool shape
                                                                [position] => 166
                                                            )
    
                                                    )
    
                                            )
    
                                    )
    
                            )
    
                        [1] => Array
                            (
                                [contents] => :<br>
                    <table>
                        <tr><td>Over 12000 bugs featured</td></tr>
                        <tr><td>Easy to use</td></tr>
                        <tr><td>Complete boxed set</td></tr>
                    
                                [position] => 1257
                                [subvars] => Array
                                    (
                                        [0] => Array
                                            (
                                                [name] => productCharacteristics
                                                [instances] => Array
                                                    (
                                                        [0] => Array
                                                            (
                                                                [contents] => Over 12000 bugs featured
                                                                [position] => 49
                                                            )
    
                                                        [1] => Array
                                                            (
                                                                [contents] => Easy to use
                                                                [position] => 119
                                                            )
    
                                                        [2] => Array
                                                            (
                                                                [contents] => Complete boxed set
                                                                [position] => 148
                                                            )
    
                                                    )
    
                                            )
    
                                    )
    
                            )
    
                        [2] => Array
                            (
                                [contents] => :<br>
                    <table>
                        <tr><td>Those are CDs</td></tr>
                        <tr><td>There are 50 of them</td></tr>
                        <tr><td>You can record 90min on each</td></tr>
                        <tr><td>And they're cool</td></tr>
                    
                                [position] => 1646
                                [subvars] => Array
                                    (
                                        [0] => Array
                                            (
                                                [name] => productCharacteristics
                                                [instances] => Array
                                                    (
                                                        [0] => Array
                                                            (
                                                                [contents] => Those are CDs
                                                                [position] => 38
                                                            )
    
                                                        [1] => Array
                                                            (
                                                                [contents] => There are 50 of them
                                                                [position] => 106
                                                            )
    
                                                        [2] => Array
                                                            (
                                                                [contents] => You can record 90min on each
                                                                [position] => 189
                                                            )
    
                                                        [3] => Array
                                                            (
                                                                [contents] => And they're cool
                                                                [position] => 235
                                                            )
    
                                                    )
    
                                            )
    
                                    )
    
                            )
    
                    )
    
            )
    
    )