webpage retriever module

By Qc on Jan 20, 2007


Retrieving a webpage and extracting the required data ("parsing") is a very common need, but sometimes it is dead easy and sometimes it is quite hard, due to the page layout:

  • Sometimes the 'line' separator isn't a carriage return line feed (CRLF), so mIRC doesn't 'see' a new line.
  • Sometimes lines are longer than mIRC's string length limit, requiring the use of binary variables (not that easy) or forced sockreads (which may break a line in the middle of a word).
  • Sometimes the 'line' separator isn't even a single character but a string, which again requires binary variables.

This snippet acts as a sort of 'script-interface', providing one single approach that handles all these problems 'internally', saving the calling script from all this.

USAGE (or move directly to the example further on):

1) You call WR_Start with parameters:

  • $1 = connection id CID
  • $2 = request id RID
  • $3 = site
  • $4 = port (optional, defaults to 80)

NOTE: CID and RID cannot have underscores _ or wildcard symbols * and ?

2) When it is done, it issues a /signal WR_ with parameters:

  • $1 = OK or FAIL
  • $2 = connection id
  • $3 = request id
  • $4- = message to display
  • hashtable WR_H
    Item Page_<CID>_<RID>
    contains the page in binary form
    Warning: this hash is global, if you access it directly, be careful to not alter other items

3) Now you can call WR_Filter with parameters

  • $1 = max. amount characters per custom window line
  • $2 = connection id
  • $3 = request id
  • $4 = position to start search
  • $5 = position to stop (0 means none)
    Optionally 2 tag strings to filter only data between them (for example, to filter out tables):
  • $6 = ascii code of a tags separator character, it has to be a character that does NOT occur inside the code tags itself.
  • $7- = (separator $6): $1 = tag1 $2 = tag2
    These 2 tags define the strings.
    If there is only one tag (an ending carriage return line feed $crlf is commonly used), use the same value for both.

If no tags specified it will filter the entire content within the given start/stop positions

This alias returns following parameters after tokenize 32 $result:

  • $1 = NOTFOUND or NOPAGE or name of temp @window with line1 = ConnectionID: RequestID:
    Case $1 = name of temp @window:
  • $2 = start position of datachunk between tags
  • $3 = end position
  • $4 = datachunk size
  • $5 = new offset for a next filter's start position

EDIT 2007/02/04:
Changed to work with given http headers.
Example changed to suit this.


; This example sends input to an AI bot and returns its answer in the status window.
; Usage: /alice sometexthere

; Alice - step 1 of the example: stores the user's text, queues the HTTP GET
; request headers for the bot page and starts the retrieval.
; Input: $1- = text to send to the bot
; The outcome is reported through the WR_ signal with ids pandora / Alice1.
alias Alice {
  ; Global variable on purpose: Alice2 reads it when it builds the POST body.
  set %Alice_input $1-
  var %cid = pandora, %rid = Alice1
  ; %c is a command prefix; each line below appends one header line for this cid/rid.
  var %c = WR_Header ADD %cid %rid
  %c GET /pandora/talk?botid=b9b96b247e34f4f2 HTTP/1.1
  %c Host: www.pandorabots.com
  %c User-Agent: Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.6) Gecko/20040206 Firefox/0.8
  %c Accept: application/x-shockwave-flash,text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1
  %c Accept-Language: en-us,en;q=0.5
  %c Accept-Encoding: gzip,deflate
  %c Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
  %c Keep-Alive: 300
  %c Connection: keep-alive
  ; The $crlf entry terminates the header section.
  %c $crlf
  WR_Start %cid %rid www.pandorabots.com
}
; Alice2 - step 2 of the example: posts the stored user text to the bot page.
; Input: $1- = text placed in front of the message field (the signal handler
;              passes the botcust2=... line found in the first response)
; The outcome is reported through the WR_ signal with ids pandora / Alice2.
alias Alice2 {
  var %cid = pandora, %rid = Alice2
  ; Build the form body and replace spaces by + for form encoding.
  var %content = $+($1-,&message=,%Alice_input), %content = $replace(%content,$chr(32),+)

  var %c = WR_Header ADD %cid %rid
  %c POST /pandora/talk?botid=b9b96b247e34f4f2 HTTP/1.1
  %c Host: www.pandorabots.com
  %c User-Agent: Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.6) Gecko/20040206 Firefox/0.8
  %c Accept: application/x-shockwave-flash,text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1
  %c Accept-Language: en-us,en;q=0.5
  %c Accept-Encoding: gzip,deflate
  %c Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
  %c Connection: Close
  %c Referer: http://www.pandorabots.com/pandora/talk?botid=b9b96b247e34f4f2
  %c Content-Type: application/x-www-form-urlencoded
  %c Content-Length: $len(%content)
  ; Blank line that ends the headers, followed by the body.
  %c $+($crlf,%content)

  WR_Start %cid %rid www.pandorabots.com
}
; Example handler for the WR_ signal raised by the retriever.
; Input: $1 = OK or FAIL  $2 = connection id  $3 = request id  $4- = message
; Dispatches on <cid>_<rid> and reports the outcome with /signal ALICE:
;   ALICE OK <bot answer>   or   ALICE FAIL <step number 1-6>
on *:SIGNAL:WR_: {
  var %result = $1, %cid = $2, %rid = $3, %msg = $4-
  var %j = $+(%cid,_,%rid)
  ; Only the two requests of this example have a label below; anything else
  ; is ignored so that the goto never targets a missing label.
  if (!$istok(pandora_Alice1 pandora_Alice2,%j,32)) { return }
  goto %j

  ; Response to the GET: look for the botcust2 value on a Set-Cookie line and pass it to Alice2.
  :pandora_Alice1
  if (%result == FAIL) { .signal ALICE FAIL 1 | return }
  WR_Filter 500 %cid %rid 1 0 32 $+(Set-Cookie:,$chr(32),;)
  if ($istok(NOPAGE NOTFOUND,$result,32)) { .signal ALICE FAIL 2 | return }
  tokenize 32 $result | var %w = $1, %datastart = $2, %dataend = $3, %datasize = $4, %newstart = $5
  var %l = $fline(%w,botcust2=*,1,0) | if (!%l) { .signal ALICE FAIL 3 | return }
  Alice2 $line(%w,%l) | window -c %w | WR_Remove %cid %rid | return

  ; Response to the POST: extract the text after the "You</b>:" marker and report it.
  :pandora_Alice2
  if (%result == FAIL) { .signal Alice FAIL 4 | return }
  WR_Filter 500 %cid %rid 1 0 58 $+(<b>,:,<br> <br>)
  if ($istok(NOPAGE NOTFOUND,$result,32)) { .signal ALICE FAIL 5 | return }
  tokenize 32 $result | var %w = $1, %datastart = $2, %dataend = $3, %datasize = $4, %newstart = $5
  var %l = $fline(%w,You</b>:*,1,0) | if (!%l) { .signal ALICE FAIL 6 | return }
  .signal ALICE OK $gettok($line(%w,%l),3-,58)
  window -c %w | WR_Remove %cid %rid | return
}



; WR_rs - local helper used by WR_rw to build a window name suffix.
; Input: $1 = repeat count
; Builds an expression string by repeating a $rand(a,z) fragment $1 times with $str
; and evaluates it a second time with $eval(...,2).
; NOTE(review): presumably yields $1 random lowercase letters joined together - confirm in mIRC.
alias -l WR_rs { return $eval($str($ $+ rand(a,z) $ $+ + $ $+ + $chr(32),$1),2) }
; WR_rw - opens a temporary custom window (window -h) and returns its name.
; The name is @WR_TMP followed by the result of WR_rs(8).
alias -l WR_rw {
  var %name = $+(@WR_TMP,$WR_rs(8))
  window -h %name
  return %name
}
; Cleanup when the script is unloaded: closes all WR_ sockets and @WR_ windows
; and frees the hash table WR_H if it exists.
on *:UNLOAD: {
  sockclose WR_* | close -@ @WR_* | if ($hget(WR_H)) { hfree WR_H }
}

; WR_Header - maintains the list of request header lines per connection/request.
; The lines are kept in a window named @WR_Header_<cid>_<rid>.
; Input: $1 = ADD or REMOVE $2 = connection id or <ALL> $3 = request id or <ALL> $4- = line content (case $1 = ADD)
alias WR_Header {
  var %cid = $2, %rid = $3 | goto $1

  ; ADD: create the header window on first use and append the line.
  :ADD
  var %w = $+(@WR_Header_,%cid,_,%rid)
  if (!$window(%w)) { window -he %w }
  aline -p %w $4-
  return

  ; REMOVE: close the matching header window(s). Ids that contain wildcard
  ; characters are refused; <ALL> is the only way to match every id.
  :REMOVE
  if (* isin %cid %rid) || (? isin %cid %rid) { return }
  if (%cid == <ALL>) { var %cid = * }
  if (%rid == <ALL>) { var %rid = * }
  close -@ $+(@WR_Header_,%cid,_,%rid)
}

; WR_Start - starts a request on the socket WR_<connection id>.
; Input: $1 = connection id $2 = request id $3 = site $4 = port ($4 optional, defaults to 80)
; Output: none directly; the outcome is reported later through /signal WR_
; If no socket with that name exists, one is opened and marked with "<request id> <site>";
; the headers are then sent from the SOCKOPEN event.
; If the socket already exists, the queued headers are sent at once.
; NOTE(review): in that case the socket mark is not updated, and WR_SendHeader takes
; the request id from the mark - confirm that reuse with a different request id is intended.
alias WR_Start {
  if ($4 isnum) { var %port = $4 } | else { var %port = 80 }
  var %sockname = WR_ $+ $1
  if (!$sock(%sockname)) { sockopen %sockname $3 %port | sockmark %sockname $2 $3 }
  else { WR_SendHeader %sockname }
}

; WR_SendHeader - writes the queued header lines to a socket.
; Input: $1 = sockname (assumes it is connected)
; Output: none
; The connection id is the last underscore-separated token of the sockname and the
; request id is the first word of the socket mark. Every line of the window
; @WR_Header_<cid>_<rid> is written with sockwrite -n, then the window is closed.
alias WR_SendHeader {
  var %cid = $gettok($1,-1,95), %rid = $gettok($sock($1).mark,1,32), %w = $+(@WR_Header_,%cid,_,%rid)
  var %total = $line(%w,0), %i = 1 | while (%i <= %total) { sockwrite -n $1 $line(%w,%i) | inc %i } | window -c %w
}

; Socket opened (or failed to open).
; After tokenize: $1 = request id, $2 = site, both taken from the socket mark.
; On error a WR_ FAIL signal is raised. Otherwise any stored page for this
; cid/rid is removed and the queued headers are sent.
on *:SOCKOPEN:WR_*:{
  var %cid = $gettok($sockname,-1,95) | tokenize 32 $sock($sockname).mark
  if ($sockerr) {
    var %m = 04WWWPAGERETRIEVER:04 $sock($sockname).ip - Connection attempt to04 $2 failed ( $+ $sock($sockname).wsmsg $+ )
    .signal WR_ FAIL %cid $1 %m | return
  }
  WR_Remove %cid $1 | WR_SendHeader $sockname
}
; Remote side closed the connection: the page is considered complete.
; After tokenize: $1 = request id, $2 = site, both taken from the socket mark.
; Raises WR_ OK with a message that contains the number of bytes stored for the page.
on *:SOCKCLOSE:WR_*: {
  var %cid = $gettok($sockname,-1,95) | tokenize 32 $sock($sockname).mark
  ; Load the stored page into a binary variable only to read its size.
  var %hip = $+(Page_,%cid,_,$1) | .echo -q $hget(WR_H,%hip,&unitemp)
  var %m = WWWPAGERETRIEVER:04 $sock($sockname).ip - Connection to04 $2 closed remotely after04 $bvar(&unitemp,0) bytes.
  bunset &unitemp | .signal WR_ OK %cid $1 %m
}
; Data received: append it to the page stored in hash table WR_H under Page_<cid>_<rid>.
; After tokenize: $1 = request id, $2 = site, both taken from the socket mark.
; On a socket error the partial page is deleted, the socket is closed and a
; WR_ FAIL signal is raised with the connection id and request id.
on *:SOCKREAD:WR_*: {
  var %cid = $gettok($sockname,-1,95) | tokenize 32 $sock($sockname).mark
  var %hip = $+(Page_,%cid,_,$1)
  if ($sockerr) {
    if ($hget(WR_H)) { hdel WR_H %hip }
    var %m = 04WWWPAGERETRIEVER:04 $sock($sockname).ip - Read - $sock($sockname).wsmsg $+ - aborted.
    ; Same parameter layout as the other WR_ signals: result, cid, rid, message.
    sockclose $sockname | .signal WR_ FAIL %cid $1 %m | return
  }
  ; Read everything that is queued into a binary variable; stop when nothing was read.
  sockread -f $sock($sockname).rq &buffer | if (!$sockbr) { return }
  ; Fetch the page stored so far, append the new bytes and store it again.
  .echo -q $hget(WR_H,%hip,&unitemp) | bcopy &unitemp $calc($bvar(&unitemp,0) + 1) &buffer 1 -1
  hadd -mb WR_H %hip &unitemp | bunset &unitemp
}

; WR_Remove - deletes stored page(s) from hash table WR_H.
; Input: $1 = connection id or <ALL> $2 = request id or <ALL>
; Ids that contain wildcard characters are refused; <ALL> is the only way to match every id.
alias WR_Remove {
  if (* isin $1-) || (? isin $1-) { return }
  var %cid = $1, %rid = $2
  if (%cid == <ALL>) { var %cid = * }
  if (%rid == <ALL>) { var %rid = * }
  if ($hget(WR_H)) { hdel -w WR_H $+(Page_,%cid,_,%rid) }
}

; WR_Filter - copies a retrieved page, or the parts of it between two tags, into a temporary @window.
; Input (separator 32): $1 = amount characters/@line $2 = connection id $3 = request id $4 = position to start search $5 = position to stop (0 means none)
; $6 = ascii code tag separator character $7- = (separator $6): $1 = tag1 $2 = tag2 ($6- optional)
; Output: $result = NOPAGE or NOTFOUND or $1 = name of @window with line 1 = ConnectionID: <cid> RequestID: <rid> and lines 2+ = webpage data
; $2 = start position of the first datachunk $3 = end position of the last datachunk $4 = $3 - $2 + 1 $5 = next search offset
; The caller has to close the returned @window.
alias WR_Filter {
  var %lsize = $1, %cid = $2, %rid = $3, %hip = $+(Page_,%cid,_,%rid), %h = WR_H | if ($hget(%h,%hip) == $null) { return NOPAGE }
  var %start = $4, %stop = $5
  if ($7 != $null) { tokenize $6 $7- | var %tag1 = $1, %tag2 = $2, %tag1len = $len(%tag1), %tag2len = $len(%tag2) } | else { var %tag1, %tag2 }
  var %w = $WR_rw
  aline -p %w ConnectionID: %cid RequestID: %rid
  .echo -q $hget(%h,%hip,&page) | var %pagesize = $bvar(&page,0), %dstart = 0, %chunkend = 0, %offset = %start

  ; No tags given: output everything between the start and stop positions as one chunk.
  if (%tag1 == $null) {
    var %chunkstart = %start
    ; Without a stop position the chunk runs from the start position to the end of the page.
    if (!%stop) { var %chunksize = $calc(%pagesize - %start + 1) } | else { var %chunksize = $calc(%stop - %start + 1) }
    goto output
  }

  ; Locate the next tag1 ... tag2 pair, searching from the current offset.
  :nextframe
  var %postag1 = $bfind(&page,%offset,%tag1).text
  if (!%postag1) || ((%stop) && ($calc(%postag1 + %tag1len) >= %stop)) { goto end }

  if (%tag1 == %tag2) {
    ; Identical tags: the closing tag is the next occurrence after the opening one.
    var %chunkstart = $calc(%postag1 + %tag1len), %postag2 = $bfind(&page,%chunkstart,%tag2).text
    if (!%postag2) || ((%stop) && ($calc(%postag2 + %tag2len) >= %stop)) { goto end }
  }
  else {
    var %postag2 = $bfind(&page,%offset,%tag2).text
    if (!%postag2) || ((%stop) && ($calc(%postag2 + %tag2len) >= %stop)) { goto end }
    ; A tag2 found in front of tag1 does not close it: search again from tag1.
    if (%postag2 < %postag1) { var %offset = %postag1 | goto nextframe }
    var %chunkstart = $calc(%postag1 + %tag1len)
  }
  if (%stop) && (%chunkstart > %stop) { goto end }
  var %chunkend = $calc(%postag2 - 1), %chunksize = $calc(%chunkend - %chunkstart + 1)

  ; Write the chunk to the window in lines of at most %lsize characters.
  :output
  if (!%dstart) { var %dstart = %chunkstart }
  var %i = %chunkstart, %chunkend = $calc(%i + %chunksize - 1) | bunset &line
  while (%i <= %chunkend) {
    ; Byte values below 32 are replaced by 127 before the text is added to the window.
    var %a = $bvar(&page,%i,1) | if (%a < 32) { var %a = 127 }
    bset &line $calc($bvar(&line,0) + 1) %a | if ($bvar(&line,0) == %lsize) { aline -pi %w $bvar(&line,1,%lsize).text | bunset &line }
    inc %i
  }
  ; Flush the remaining partial line; $v1 is the length tested in the condition.
  if ($bvar(&line,0) > 0) { aline -pi %w $bvar(&line,1,$v1).text | bunset &line }
  if (%tag1 == $null) { goto end }
  ; With identical tags the closing tag also serves as the next opening tag.
  if (%tag1 != %tag2) { var %offset = $calc(%postag2 + %tag2len) } | else { var %offset = %postag2 }
  goto nextframe

  ; More than the header line in the window means at least one chunk was written.
  :end
  if ($line(%w,0) > 1) { return %w %dstart %chunkend $calc(%chunkend - %dstart + 1) %offset } | window -c %w | return NOTFOUND
}


Sign in to comment.
Qc   -  Jan 27, 2007

It's a coincidence: it was a page someone wanted to get data from, and since the goal of this snippet is to make that easier, I could quickly do that.

If you can't get it to work, tell me the url of your webpage and which data you want to isolate, and I'll make another 'example'.

Marshtomp   -  Jan 26, 2007

Sorry for the double post, but I can't get it to work! :X

Marshtomp   -  Jan 26, 2007

Is it a coincidence that it links to a Dutch page? Ik vind het knap dat een nederlandse mens een engeks script kunt schijven :S

DarthReven   -  Jan 21, 2007

Pretty neat code. Overall it's pretty interesting; I'd just recommend trying to consolidate the aliases if at all possible by using $prop and either goto loops or IF and ELSEIF statements.

