/**************************
* GOLD SEEKER
* Data Extraction Tool
**************************/
== DOCUMENTATION ==============================================================
Public Methods
--------------
GoldSeeker::setVerbose(boolean)
Tells GoldSeeker whether to display its results in the browser.
Features & samples
------------------
I. Simple value extraction
~~~~~~~~~~~~~~~~~~~~~~~~~~
Config file:
************
<<locate name="pageTitle">>
<<begin>><title><</begin>>
<<end>></title><</end>>
<</locate>>
Data source:
************
<html>
<head>
<title>My sample</title>
</head>
</html>
Result:
*******
Array
(
[0] => Array
(
[name] => pageTitle
[instances] => Array
(
[0] => Array
(
[contents] => My sample
[position] => 26
)
)
)
)
II. Loop value extraction
~~~~~~~~~~~~~~~~~~~~~~~~~
Config file:
************
<<locate>>
<<begin>><table width="100%"><</begin>>
<<section name="firstCol,secondCol">>
<<begin>><td><</begin>>
<<end>></td><</end>>
<<begin>><td><</begin>>
<<end>></td><</end>>
<<endofline>></tr><</endofline>>
<</section>>
<<end>></table><</end>>
<</locate>>
Data source:
************
<html>
<body>
<table>
<tr>
<td>I don't want the data from this table</td>
<td></td>
</tr>
</table>
<table width="100%">
<tr>
<td>This is the data I want</td>
<td>This too</td>
<td>Not this</td>
</tr>
<tr>
<td>I want...</td>
<td>I want...</td>
<td>I don't want</td>
</tr>
</table>
</body>
</html>
Result:
*******
Array
(
[0] => Array
(
[name] => firstCol
[instances] => Array
(
[0] => Array
(
[contents] => This is the data I want
[position] => 159
)
[1] => Array
(
[contents] => I want...
[position] => 276
)
)
)
[1] => Array
(
[name] => secondCol
[instances] => Array
(
[0] => Array
(
[contents] => This too
[position] => 209
)
[1] => Array
(
[contents] => I want...
[position] => 313
)
)
)
)
III. Startsafter feature
~~~~~~~~~~~~~~~~~~~~~~~~
Config file:
************
<<locate name="dummy">>
<<begin>>John Doe<</begin>>
<<end>><a<</end>>
<</locate>>
<<locate name="path" startsafter=$dummy>>
<<begin>>href="mailto:<</begin>>
<<end>>"<</end>>
<</locate>>
<<locate name="age" startsafter=$path>>
<<begin>>(<</begin>>
<<end>>)<</end>>
<</locate>>
Data source:
************
<html>
<body>
Members
<br><br>
<br>Julian Ray <a href="mailto:jray@dummy.com"> (18)
<br>Samantha Williams <a href="mailto:swill@dummy.com"> (33)
<br>John Doe <a href="mailto:jdoe@dummy.com"> (21)
<br>Max Dammis <a href="mailto:mdamm@dummy.com"> (17)
</body>
</html>
Result:
*******
Array
(
[0] => Array
(
[name] => dummy
[instances] => Array
(
[0] => Array
(
[contents] =>
[position] => 169
)
)
)
[1] => Array
(
[name] => path
[instances] => Array
(
[0] => Array
(
[contents] => jdoe@dummy.com
[position] => 186
)
)
)
[2] => Array
(
[name] => age
[instances] => Array
(
[0] => Array
(
[contents] => 21
[position] => 204
)
)
)
)
IV. Crawling feature & database export
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Config files:
*************
sample1.gs:
-----------
<<locate>>
<<section name="urlPage,nomPage">>
<<begin>>p class=g><a href=<</begin>>
<<end>>><</end>>
<<begin>><</begin>>
<<end>></a><</end>>
<</section>>
<</locate>>
<<action crawl url="$urlPage" configfile="./sample2.gs">>
sample2.gs:
-----------
<<locate name="title">>
<<begin>><title><</begin>>
<<end>></title><</end>>
<</locate>>
<<action insert table="googleTitles" fields="title" values="$title"/>>
Data source:
************
http://www.google.fr/search?sourceid=navclient&hl=fr&ie=UTF-8&oe=UTF-8&q=test
Result:
*******
The googleTitles table is filled with the title of each crawled page.
V. Nested section / <<or>> example
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Config file:
************
<<locate>>
<<begin>><table><</begin>>
<<section name="productName,productPrice,dummy">>
<<begin>><td><</begin>>
<<end>>$<</end>>
<<begin>><</begin>>
<<end>></td><</end>>
<<begin>>Characteristics<</begin>>
<<section name="productCharacteristics">>
<<begin>><tr><td><</begin>>
<<end>></td></tr><</end>>
<</section>>
<<end>></table><</end>>
<<endofline>>c_grey<<or>>c_red<</endofline>>
<</section>>
<<end>></html><</end>>
<</locate>>
Data source:
************
<html>
<body>
<table>
<tr>
<td colspan=10>Best deals</td>
</tr>
<tr class="c_grey">
<td>1. Logetich XM 007 $40</td>
<td>instead of $90</td>
<td>
Characteristics:<br>
<table>
<tr><td>Triple optical lense</td></tr>
<tr><td>5610654000 dpi precision</td></tr>
<tr><td>Cool shape</td></tr>
</tr>
</table>
</td>
</tr>
<tr class="c_white">
<td>Get this great price with code "65ds4f0"</td>
</tr>
<tr class="c_red">
<td>2. Mocrisoft Zindoz XXX $120</td>
<td>instead of $200</td>
<td>
Characteristics:<br>
<table>
<tr><td>Over 12000 bugs featured</td></tr>
<tr><td>Easy to use</td></tr>
<tr><td>Complete boxed set</td></tr>
</table>
</td>
</tr>
<tr class="c_white">
<td>Get this great price with code "551654sdf"</td>
</tr>
<tr class="c_grey">
<td>3. Virbatem CD-x 900min. pk50 $10</td>
<td>instead of $20</td>
<td>
Characteristics:<br>
<table>
<tr><td>Those are CDs</td></tr>
<tr><td>There are 50 of them</td></tr>
<tr><td>You can record 90min on each</td></tr>
<tr><td>And they're cool</td></tr>
</table>
</td>
</tr>
<tr class="c_white">
<td>Get this great price with code "sdf654e"</td>
</tr>
</table>
</body>
</html>
Result:
*******
Array
(
[0] => Array
(
[name] => productName
[instances] => Array
(
[0] => Array
(
[contents] => 1. Logetich XM 007
[position] => 58
)
[1] => Array
(
[contents] => 2. Mocrisoft Zindoz XXX
[position] => 828
)
[2] => Array
(
[contents] => 3. Virbatem CD-x 900min. pk50
[position] => 1593
)
)
)
[1] => Array
(
[name] => productPrice
[instances] => Array
(
[0] => Array
(
[contents] => 40
[position] => 78
)
[1] => Array
(
[contents] => 120
[position] => 853
)
[2] => Array
(
[contents] => 10
[position] => 1624
)
)
)
[2] => Array
(
[name] => dummy
[instances] => Array
(
[0] => Array
(
[contents] => :<br>
<table>
<tr><td>Triple optical lense</td></tr>
<tr><td>5610654000 dpi precision</td></tr>
<tr><td>Cool shape</td></tr>
</tr>
[position] => 487
[subvars] => Array
(
[0] => Array
(
[name] => productCharacteristics
[instances] => Array
(
[0] => Array
(
[contents] => Triple optical lense
[position] => 45
)
[1] => Array
(
[contents] => 5610654000 dpi precision
[position] => 124
)
[2] => Array
(
[contents] => Cool shape
[position] => 166
)
)
)
)
)
[1] => Array
(
[contents] => :<br>
<table>
<tr><td>Over 12000 bugs featured</td></tr>
<tr><td>Easy to use</td></tr>
<tr><td>Complete boxed set</td></tr>
[position] => 1257
[subvars] => Array
(
[0] => Array
(
[name] => productCharacteristics
[instances] => Array
(
[0] => Array
(
[contents] => Over 12000 bugs featured
[position] => 49
)
[1] => Array
(
[contents] => Easy to use
[position] => 119
)
[2] => Array
(
[contents] => Complete boxed set
[position] => 148
)
)
)
)
)
[2] => Array
(
[contents] => :<br>
<table>
<tr><td>Those are CDs</td></tr>
<tr><td>There are 50 of them</td></tr>
<tr><td>You can record 90min on each</td></tr>
<tr><td>And they're cool</td></tr>
[position] => 1646
[subvars] => Array
(
[0] => Array
(
[name] => productCharacteristics
[instances] => Array
(
[0] => Array
(
[contents] => Those are CDs
[position] => 38
)
[1] => Array
(
[contents] => There are 50 of them
[position] => 106
)
[2] => Array
(
[contents] => You can record 90min on each
[position] => 189
)
[3] => Array
(
[contents] => And they're cool
[position] => 235
)
)
)
)
)
)
)
)
|