INTRODUCTION:
Retrieving a webpage and extracting the required data ("parsing") is a very common need, but sometimes it is dead easy and sometimes it is quite hard, due to the page layout:
This snippet acts as a sort of 'script-interface', providing one single approach that handles all these problems 'internally', saving the calling script from all this.
USAGE (or move directly to the example further on):
1) You call WR_Start with parameters:
NOTE: CID and RID cannot have underscores _ or wildcard symbols * and ?
2) When it is done, it issues a /signal WR_ with parameters:
3) Now you can call WR_Filter with parameters
If no tags are specified, it will filter the entire content within the given start/stop positions.
This alias returns the following parameters (available after /tokenize 32 $result):
EDIT 2007/02/04:
Changed to work with given http headers.
Example changed to suit this.
; === WWW PAGE RETRIEVER USAGE EXAMPLE ===
; This example sends input to an AI bot and returns its answer in the status window.
; Usage: /alice sometexthere
alias Alice {
  ; Example, step 1: store the text to send in the global variable %Alice_input,
  ; queue the GET request header for connection id pandora / request id Alice1
  ; and start the retrieval. The result arrives later through the WR_ signal.
  ; $1- = text to send to the bot
  set %Alice_input $1-
  var %cid = pandora
  var %rid = Alice1
  ; %hdr holds the command prefix that appends one line to this request's header
  var %hdr = WR_Header ADD %cid %rid
  %hdr GET /pandora/talk?botid=b9b96b247e34f4f2 HTTP/1.1
  %hdr Host: www.pandorabots.com
  %hdr User-Agent: Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.6) Gecko/20040206 Firefox/0.8
  %hdr Accept: application/x-shockwave-flash,text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1
  %hdr Accept-Language: en-us,en;q=0.5
  %hdr Accept-Encoding: gzip,deflate
  %hdr Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
  %hdr Keep-Alive: 300
  %hdr Connection: keep-alive
  ; final line of the request header
  %hdr $crlf
  WR_Start %cid %rid www.pandorabots.com
}
alias Alice2 {
  ; Example, step 2: post the stored input to the bot.
  ; $1- = text placed in front of the message field (the WR_ signal handler passes
  ;       the botcust2=... line it extracted from the first response)
  ; The result arrives later through the WR_ signal with request id Alice2.
  var %cid = pandora
  var %rid = Alice2
  ; form body: spaces become + signs
  ; NOTE(review): other reserved characters in %Alice_input are not URL-encoded
  var %body = $+($1-,&message=,%Alice_input)
  var %body = $replace(%body,$chr(32),+)
  ; %hdr holds the command prefix that appends one line to this request's header
  var %hdr = WR_Header ADD %cid %rid
  %hdr POST /pandora/talk?botid=b9b96b247e34f4f2 HTTP/1.1
  %hdr Host: www.pandorabots.com
  %hdr User-Agent: Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.6) Gecko/20040206 Firefox/0.8
  %hdr Accept: application/x-shockwave-flash,text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1
  %hdr Accept-Language: en-us,en;q=0.5
  %hdr Accept-Encoding: gzip,deflate
  %hdr Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
  %hdr Connection: Close
  %hdr Referer: http://www.pandorabots.com/pandora/talk?botid=b9b96b247e34f4f2
  %hdr Content-Type: application/x-www-form-urlencoded
  %hdr Content-Length: $len(%body)
  ; header terminator followed by the form body
  %hdr $+($crlf,%body)
  WR_Start %cid %rid www.pandorabots.com
}
on *:SIGNAL:WR_: {
  ; Example handler for the WR_ signal raised by the page retriever.
  ; $1 = OK or FAIL   $2 = connection id   $3 = request id   $4- = status message
  ; Dispatches on connectionid_requestid and reports through the ALICE signal:
  ;   ALICE OK text   or   ALICE FAIL number
  var %result = $1, %cid = $2, %rid = $3, %msg = $4-
  ; jump to the label named after this connection/request pair; the last label in
  ; this event is written with the same variable, presumably so that pairs without
  ; a label of their own land there and do nothing - TODO confirm
  var %j = $+(%cid,_,%rid) | goto %j
  :pandora_Alice1
  if (%result == FAIL) { .signal ALICE FAIL 1 | return }
  ; extract every chunk found between "Set-Cookie: " and ";"
  WR_Filter 500 %cid %rid 1 0 32 $+(Set-Cookie:,$chr(32),;)
  if ($istok(NOPAGE NOTFOUND,$result,32)) { .signal ALICE FAIL 2 | return }
  tokenize 32 $result | var %w = $1, %datastart = $2, %dataend = $3, %datasize = $4, %newstart = $5
  var %l = $fline(%w,botcust2=*,1,0)
  ; on failure, release the temporary window and the page buffer too (previously leaked)
  if (!%l) { window -c %w | WR_Remove %cid %rid | .signal ALICE FAIL 3 | return }
  Alice2 $line(%w,%l) | window -c %w | WR_Remove %cid %rid | return
  :pandora_Alice2
  if (%result == FAIL) { .signal Alice FAIL 4 | return }
  ; extract every chunk found between "<b>" and "<br> <br>"
  WR_Filter 500 %cid %rid 1 0 58 $+(<b>,:,<br> <br>)
  if ($istok(NOPAGE NOTFOUND,$result,32)) { .signal ALICE FAIL 5 | return }
  tokenize 32 $result | var %w = $1, %datastart = $2, %dataend = $3, %datasize = $4, %newstart = $5
  var %l = $fline(%w,You</b>:*,1,0)
  ; on failure, release the temporary window and the page buffer too (previously leaked)
  if (!%l) { window -c %w | WR_Remove %cid %rid | .signal ALICE FAIL 6 | return }
  .signal ALICE OK $gettok($line(%w,%l),3-,58)
  window -c %w | WR_Remove %cid %rid | return
  :%j
}
; === WWW PAGE RETRIEVER MODULE ===
; Local helper used by WR_rw to build a random window-name suffix.
; $1 = repeat count. The expression repeats a piece of text containing $rand(a,z)
; $1 times and evaluates the result a second time with $eval(...,2); this presumably
; yields $1 random lowercase letters - TODO confirm the exact output format.
alias -l WR_rs { return $eval($str($ $+ rand(a,z) $ $+ + $ $+ + $chr(32),$1),2) }
alias -l WR_rw {
  ; Local helper: opens a hidden window named @WR_TMP followed by the result of
  ; WR_rs(8) and returns that window name.
  var %name = $+(@WR_TMP,$WR_rs(8))
  window -h %name
  return %name
}
on *:UNLOAD: {
  ; Clean up when the script is unloaded: sockets, windows and the page hash table.
  sockclose WR_*
  close -@ @WR_*
  if ($hget(WR_H)) {
    hfree WR_H
  }
}
; +++ WWW PAGE RETRIEVER / HEADER SETUP
; Input: $1 = ADD or REMOVE $2 = connection id or <ALL> $3 = request id or <ALL> $4- = line content (case $1 = ADD)
alias WR_Header {
  ; Maintains the request header lines, kept in a hidden window per connection/request.
  ; $1 = ADD or REMOVE   $2 = connection id or <ALL>   $3 = request id or <ALL>
  ; $4- = header line to append (ADD only)
  var %cid = $2
  var %rid = $3
  ; the action name doubles as the label to jump to
  goto $1

  :ADD
  var %hwin = $+(@WR_Header_,%cid,_,%rid)
  ; create the header window on first use
  if (!$window(%hwin)) { window -he %hwin }
  aline -p %hwin $4-
  return

  :REMOVE
  ; ids containing wildcard characters are refused; <ALL> is the only way to match many
  if (* isin %cid %rid) || (? isin %cid %rid) { return }
  if (%cid == <ALL>) { var %cid = * }
  if (%rid == <ALL>) { var %rid = * }
  close -@ $+(@WR_Header_,%cid,_,%rid)
}
; +++ WWW PAGE RETRIEVER / START
; Input: $1 = connection id $2 = request id $3 = site $4 = port ($4 optional, defaults to 80)
; Output:
alias WR_Start {
  ; Starts a request on connection $1.
  ; $1 = connection id   $2 = request id   $3 = site   $4 = port (optional, defaults to 80)
  ; If no socket named WR_ plus the connection id exists, one is opened and the header
  ; is sent from the SOCKOPEN event; otherwise the header is sent on the existing socket.
  var %port = $iif($4 isnum,$4,80)
  var %sockname = WR_ $+ $1
  if (!$sock(%sockname)) {
    sockopen %sockname $3 %port
    sockmark %sockname $2 $3
  }
  else {
    ; WR_SendHeader and the socket events read the request id from the socket mark,
    ; so it must be refreshed for the new request; otherwise the header window of the
    ; previous request id is looked up and nothing is sent
    sockmark %sockname $2 $3
    WR_SendHeader %sockname
  }
}
; +++ WWW PAGE RETRIEVER / SEND REQUEST HEADER
; Input: $1 = sockname (assumes it is connected)
; Output:
alias WR_SendHeader {
  ; Writes every line of the queued request header to socket $1, then closes the
  ; header window.
  ; $1 = socket name; the connection id is its last underscore-separated token and
  ;      the request id is the first word of the socket mark
  var %cid = $gettok($1,-1,95)
  var %rid = $gettok($sock($1).mark,1,32)
  var %hwin = $+(@WR_Header_,%cid,_,%rid)
  var %count = $line(%hwin,0), %n = 1
  while (%n <= %count) {
    sockwrite -n $1 $line(%hwin,%n)
    inc %n
  }
  window -c %hwin
}
; +++ WWW PAGE RETRIEVER / SOCKET EVENTS
on *:SOCKOPEN:WR_*:{
  ; Connection attempt finished. On error a WR_ FAIL signal is raised; otherwise any
  ; page stored for this connection/request is dropped and the request header is sent.
  ; The socket mark holds: request id, site
  var %cid = $gettok($sockname,-1,95)
  var %rid = $gettok($sock($sockname).mark,1,32), %site = $gettok($sock($sockname).mark,2,32)
  if ($sockerr) {
    var %m = 04WWWPAGERETRIEVER:04 $sock($sockname).ip - Connection attempt to04 %site failed ( $+ $sock($sockname).wsmsg $+ )
    .signal WR_ FAIL %cid %rid %m
    return
  }
  WR_Remove %cid %rid
  WR_SendHeader $sockname
}
on *:SOCKCLOSE:WR_*: {
  ; The remote side closed the connection: raise WR_ OK with a status message that
  ; includes the number of bytes stored for this connection/request.
  ; The socket mark holds: request id, site
  var %cid = $gettok($sockname,-1,95)
  var %rid = $gettok($sock($sockname).mark,1,32), %site = $gettok($sock($sockname).mark,2,32)
  var %hip = $+(Page_,%cid,_,%rid)
  ; load the stored page into a binary variable only to read its size
  .echo -q $hget(WR_H,%hip,&unitemp)
  var %m = WWWPAGERETRIEVER:04 $sock($sockname).ip - Connection to04 %site closed remotely after04 $bvar(&unitemp,0) bytes.
  bunset &unitemp
  .signal WR_ OK %cid %rid %m
}
on *:SOCKREAD:WR_*: {
  ; Data arrived: append it to the page stored in hash table WR_H under the item
  ; Page_connectionid_requestid. On a read error the stored page is deleted, the
  ; socket is closed and a WR_ FAIL signal is raised.
  ; After the tokenize below: $1 = request id, $2 = site (taken from the socket mark)
  var %cid = $gettok($sockname,-1,95) | tokenize 32 $sock($sockname).mark
  var %hip = $+(Page_,%cid,_,$1)
  if ($sockerr) {
    if ($hget(WR_H)) { hdel WR_H %hip }
    var %m = 04WWWPAGERETRIEVER:04 $sock($sockname).ip - Read - $sock($sockname).wsmsg $+ - aborted.
    ; pass connection id and request id, as the SOCKOPEN failure signal does
    ; (this previously used an undefined variable, which shifted the signal parameters)
    sockclose $sockname | .signal WR_ FAIL %cid $1 %m | return
  }
  ; read everything waiting in the receive queue into a binary variable
  sockread -f $sock($sockname).rq &buffer | if (!$sockbr) { return }
  ; fetch the data stored so far, append the new bytes and store it again
  .echo -q $hget(WR_H,%hip,&unitemp) | bcopy &unitemp $calc($bvar(&unitemp,0) + 1) &buffer 1 -1
  hadd -mb WR_H %hip &unitemp | bunset &unitemp
}
; +++ WWW PAGE RETRIEVER / REMOVE FROM PAGEBUFFER
; Input: $1 = connection id or <ALL> $2 = request id or <ALL>
alias WR_Remove {
  ; Deletes stored pages from hash table WR_H.
  ; $1 = connection id or <ALL>   $2 = request id or <ALL>
  ; Ids containing wildcard characters are refused; <ALL> is the only way to match many.
  if (* isin $1-) || (? isin $1-) { return }
  ; nothing to delete when the table does not exist
  if (!$hget(WR_H)) { return }
  var %cid = $1, %rid = $2
  if (%cid == <ALL>) { var %cid = * }
  if (%rid == <ALL>) { var %rid = * }
  hdel -w WR_H $+(Page_,%cid,_,%rid)
}
; +++ WWW PAGE RETRIEVER / FILTER PAGEBUFFER
; Input (separator 32): $1 = amount characters/@line $2 = connection id $3 = request id $4 = position to start search $5 = position to stop (0 means none)
; $6 = ascii code tag separator character $7- = (separator $6): $1 = tag1 $2 = tag2 ($6- optional)
; Output: $result = NOPAGE or NOTFOUND or $1 = name of @window with line 1 = "ConnectionID: <cid> RequestID: <rid>" and lines 2+ = webpage data
; $2 = start position of datachunk between tags $3 = end position $4 = datachunk size $5 = next search offset
alias WR_Filter {
  ; Copies data from the stored page of a connection/request into a new hidden window.
  ; Input (separator 32):
  ;   $1 = characters per window line   $2 = connection id   $3 = request id
  ;   $4 = position to start the search   $5 = position to stop (0 means none)
  ;   $6 = ascii code of the tag separator   $7- = tag1 and tag2 separated by chr $6 (optional)
  ; Without tags the whole range start..stop is copied; with tags every chunk found
  ; between tag1 and tag2 is copied.
  ; Returns NOPAGE when no page is stored, NOTFOUND when nothing was copied, otherwise:
  ;   window name, start of first chunk, end of last chunk, size between them, next search offset
  ; Window line 1 is an identification line; the data starts at line 2.
  var %lsize = $1, %cid = $2, %rid = $3, %hip = $+(Page_,%cid,_,%rid), %h = WR_H
  if ($hget(%h,%hip) == $null) { return NOPAGE }
  var %start = $4, %stop = $5
  ; note: the tokenize replaces $1- with the tags, so the inputs were saved above
  if ($7 != $null) { tokenize $6 $7- | var %tag1 = $1, %tag2 = $2, %tag1len = $len(%tag1), %tag2len = $len(%tag2) }
  else { var %tag1, %tag2 }
  var %w = $WR_rw
  aline -p %w ConnectionID: %cid RequestID: %rid
  .echo -q $hget(%h,%hip,&page) | var %pagesize = $bvar(&page,0), %dstart = 0, %chunkend = 0, %offset = %start
  if (%tag1 == $null) {
    ; no tags: output the range start..stop as one chunk
    var %chunkstart = %start
    ; never run past the end of the page: without a stop position, or with one beyond
    ; the page, the chunk ends at the last byte (it used to be pagesize bytes long
    ; regardless of the start position)
    if (!%stop) || (%stop > %pagesize) { var %chunksize = $calc(%pagesize - %start + 1) }
    else { var %chunksize = $calc(%stop - %start + 1) }
    goto output
  }
  :nextframe
  ; locate the next tag1 / tag2 pair at or after the current offset
  var %postag1 = $bfind(&page,%offset,%tag1).text
  if (!%postag1) || ((%stop) && ($calc(%postag1 + %tag1len) >= %stop)) { goto end }
  if (%tag1 == %tag2) {
    ; identical tags: the closing tag is the next occurrence after the opening one
    var %chunkstart = $calc(%postag1 + %tag1len), %postag2 = $bfind(&page,%chunkstart,%tag2).text
    if (!%postag2) || ((%stop) && ($calc(%postag2 + %tag2len) >= %stop)) { goto end }
  }
  else {
    var %postag2 = $bfind(&page,%offset,%tag2).text
    if (!%postag2) || ((%stop) && ($calc(%postag2 + %tag2len) >= %stop)) { goto end }
    ; a tag2 in front of tag1 does not close this chunk: search again from tag1
    if (%postag2 < %postag1) { var %offset = %postag1 | goto nextframe }
    var %chunkstart = $calc(%postag1 + %tag1len)
  }
  if (%stop) && (%chunkstart > %stop) { goto end }
  var %chunkend = $calc(%postag2 - 1), %chunksize = $calc(%chunkend - %chunkstart + 1)
  :output
  ; remember where the first chunk started
  if (!%dstart) { var %dstart = %chunkstart }
  var %i = %chunkstart, %chunkend = $calc(%i + %chunksize - 1) | bunset &line
  while (%i <= %chunkend) {
    ; byte values below 32 are replaced by 127
    var %a = $bvar(&page,%i,1) | if (%a < 32) { var %a = 127 }
    ; collect bytes and write a window line whenever the line size is reached
    bset &line $calc($bvar(&line,0) + 1) %a | if ($bvar(&line,0) == %lsize) { aline -pi %w $bvar(&line,1,%lsize).text | bunset &line }
    inc %i
  }
  ; write what is left of the chunk; $v1 is the remaining length tested in the condition
  if ($bvar(&line,0) > 0) { aline -pi %w $bvar(&line,1,$v1).text | bunset &line }
  if (%tag1 == $null) { goto end }
  ; continue behind tag2, or at tag2 itself when both tags are the same, so that a
  ; closing tag can open the next chunk
  if (%tag1 != %tag2) { var %offset = $calc(%postag2 + %tag2len) }
  else { var %offset = %postag2 }
  goto nextframe
  :end
  ; more than the identification line means data was found
  if ($line(%w,0) > 1) { return %w %dstart %chunkend $calc(%chunkend - %dstart + 1) %offset }
  window -c %w
  return NOTFOUND
}