| Current Path : /home/mudbot/eggdrop/ |
| Current File : //home/mudbot/eggdrop/lilyurl1.2.tcl |
# URL 2 IRC by Lily (starlily@gmail.com)
# Scans links in IRC channels and returns titles and tinyurl, and logs to a webpage.
# Will tag weblog entries NSFW if that appears in the line with the link.
# Has duplicate link detection - displays newest entry only, with link count and poster list.
# For Eggdrop bots. Has only been tested on eggdrop 1.6.19+
# Requires TCL8.4 or greater with http, htmlparse and tls, and the sqlite3 tcl lib. (not mysql!)
# For deb/ubuntu, the packages needed are tcllib and libsqlite3-tcl. Adjust for your flavor.
# You must ".chanset #channel +url2irc" for each chan you wish to use this in.
# This needs to be set to a bot writable dir for the web log pages.
set url2irc(path) /var/www/html/urllog ;# path to bot writable dir for web log pages
# Optional space separated list of domains/URLs/keywords to ignore. Entries are * expanded both ways, you have been warned.
set url2irc(iglist) "lemonparty.org decentsite.tld/somepath/terriblepicture.jpg"
# You may want to change these, but they are set pretty well.
set url2irc(maxdays) 7 ;# maximum number of days to save on log page
set url2irc(tlength) 90 ;# minimum url length for tinyurl (tinyurl is 18 chars..)
set url2irc(pubmflags) "-|-" ;# user flags required for use
# Fine tuning, safe to ignore.
set url2irc(ignore) "bdkqr|dkqr" ;# user flags script will ignore
set url2irc(length) 12 ;# minimum url length for title (12 chars is the shortest url possible, equalling all)
set url2irc(clength) 90 ;# log page url display length
set url2irc(delay) 2 ;# minimum seconds between use
set url2irc(timeout) 90000 ;# geturl timeout
set url2irc(maxsize) 1048576 ;# max page size in bytes
# 01 - Basic features set 20090521
# 02 - Build logger web page function and title regexp cleanup 20101130
# 05 - Fix logger for multiple chans, user agent string, web dir check 20101204
# 07 - s/regexp/htmlparse/, truncate long url display on log page 20101212
# 09 - some ::http cleanup volunteered by Steve (thanks!) 20110220: Check Content-Type; only get title for text pages
# - under maxsize, otherwise display mime-type. Follow http redirect (not meta refresh or javascript).
# 10 - converted to sqlite3, main loop cleanup 20110305
# 11 - secure URL handler, NSFW tagger 20110308
# 12 - site blacklist, comment cleanup, chanflag removal (just one now) 20110309
# 14 - post counts, day dividers, dupe detection - display newest w/ counter and poster list, carryover NSFW flag. 20110110
# 15 - logger index.html 20110311
# 16 - cleanups, 1.0 version for egghelp.
# 1.2 - fixed shortlink redirects, dbfile varname change, 20120821
# TODO: urlsearch, sticky/hide
# BUGS: alt langs (fixed in tcl8.5?)
################################################
package require http
package require htmlparse
package require tls
package require sqlite3
set url2irc(last) 111
set udbfile "./urllog.db"
setudef flag url2irc
bind pubm $url2irc(pubmflags) {*://*} pubm:url2irc
proc pubm:url2irc {nick host user chan text} {
global url2irc
global udbfile
global botnick
set url2irc(redirects) 0
if {([channel get $chan url2irc]) && ([expr [unixtime] - $url2irc(delay)] > $url2irc(last)) && (![matchattr $user $url2irc(ignore)])} {
regsub "#" $chan "" cname
if {[string match -nocase "*nsfw*" $text]} { set lflag NSFW } else { set lflag {} }
foreach word [split $text] {
if {[string length $word] >= $url2irc(length) && [regexp {^(f|ht)tp(s|)://} $word] && ![regexp {://([^/:]*:([^/]*@|\d+(/|$))|.*/\.)} $word]} {
foreach item $url2irc(iglist) {
if {[string match "*$item*" $word]} {return 0}
}
set url2irc(last) [unixtime]
if {[string length $word] >= $url2irc(tlength)} {
set newurl [tinyurl $word]
} else { set newurl "" }
set urtitle [urltitle $word]
if {[string length $newurl]} {
puthelp "PRIVMSG $chan :\002$urtitle\002 ( $newurl ), linked by $nick"
} else { puthelp "PRIVMSG $chan :\002$urtitle\002, linked by $nick" }
set lTime [clock seconds]
sqlite3 ldb $udbfile
ldb eval {CREATE TABLE IF NOT EXISTS urllog (lTime INTEGER,lchan TEXT,lnick TEXT,lurl TEXT,ltitle TEXT,lflag TEXT)}
ldb eval {INSERT INTO urllog (lTime, lchan, lnick, lurl, ltitle, lflag)VALUES($lTime,$cname,$nick,$word,$urtitle,$lflag)}
ldb close
}
}
if {[file isdirectory $url2irc(path)] && [file writable $url2irc(path)]} {
sqlite3 ldb $udbfile
set rtime [expr [clock seconds] - ($url2irc(maxdays) * 86400)]
ldb eval {DELETE FROM urllog WHERE lTime < $rtime}
set logday 0000
set htmlpage [ open "$url2irc(path)/$cname.html" w+ ]
puts $htmlpage "<html><head><meta http-equiv=\"refresh\" content=\"600\" /><title>URL Log for $chan</title><head>"
set lcount [ldb eval {SELECT COUNT(distinct lurl) FROM urllog where lchan = $cname}]
set tdays [expr (([clock seconds] - [ldb eval {SELECT lTime FROM urllog where lchan = $cname order by rowid asc limit 1}]) / 86400) +1]
puts $htmlpage "<body bgcolor=white><p><h1>URL Log for $chan</h1>$lcount URLs in $tdays days<br><small>This page reloads itself every 5 minutes.<br>Go back to the <a href=\"./index.html\">Index</a></small></p><p>Date - Time - <i>Nick</i> - URL<br><b>Title</b>"
foreach lrowid [ldb eval {SELECT rowid FROM urllog WHERE lchan = $cname order by rowid desc}] {
set lrurl [ldb eval {SELECT lurl FROM urllog where rowid = $lrowid }]
set lrucount [ldb eval {SELECT COUNT(1) FROM urllog where lurl = $lrurl AND lchan = $cname}]
set lrnick [ldb eval {SELECT DISTINCT lnick FROM urllog where lurl = $lrurl AND lchan = $cname}]
if {$lrucount > 1 } {
if {[ldb eval {SELECT rowid from urllog where lurl = $lrurl AND lchan = $cname order by rowid desc limit 1}]!=$lrowid} {
continue } else {
regsub -all " " $lrnick "/" lrnick
set plrnick "linked $lrucount times by $lrnick"
}
} else {set plrnick $lrnick}
set lrtitle [ldb onecolumn {SELECT ltitle FROM urllog where rowid = $lrowid }]
set lrTime [ldb eval {SELECT lTime FROM urllog where rowid = $lrowid }]
set tstamp [clock format $lrTime -format {%b. %d - %H:%M}]
if {[ldb eval {SELECT COUNT(1) FROM urllog where lurl = $lrurl AND lchan = $cname and lflag like '%NSFW%'}]} {
set lrf " - (marked <font color=\"red\"><b>NSFW</b></font>)"} else {set lrf ""}
if {[string length $lrurl] >=$url2irc(clength)} {
set plrurl "[string replace $lrurl $url2irc(clength) end ] ..."
} else { set plrurl $lrurl }
if {[clock format $lrTime -format {%m%d}] != $logday} {
set plogday [clock format $lrTime -format {%A %B %d}]
puts $htmlpage "<p><center><b>$plogday</b></center></p><hr>"
set logday [clock format $lrTime -format {%m%d}]
}
puts $htmlpage "<p>$tstamp - <i>$plrnick</i> - <a href=\"$lrurl\">$plrurl</a><br><b>$lrtitle</b>$lrf<hr>"
}
puts $htmlpage "<center><small><b>URL 2 IRC</b> by Lily</small></center></body></html>"
close $htmlpage
set indexpage [ open "$url2irc(path)/index.html" w+ ]
puts $indexpage "<html><head><meta http-equiv=\"refresh\" content=\"600\" /><title>URL Log Index for $botnick</title><head>"
set ilcount [ldb eval {SELECT COUNT(distinct lurl) FROM urllog}]
set ichanc [ldb eval {SELECT COUNT(distinct lchan) FROM urllog}]
puts $indexpage "<body bgcolor=white><p><h1>URL Log Index for $botnick</h1>$ilcount URLs in $ichanc channels<br><small>This page reloads itself every 5 minutes.</small></p><hr>"
foreach chanid [ldb eval {SELECT distinct lchan FROM urllog}] {
set ilcount [ldb eval {SELECT COUNT(distinct lurl) FROM urllog where lchan = $chanid}]
set ilTime [ldb eval {SELECT lTime from urllog where lchan = $chanid order by rowid desc limit 1}]
set itstamp [clock format $ilTime -format {%B %d at %H:%M}]
set iltitle [ldb onecolumn {SELECT ltitle from urllog where lchan = $chanid order by rowid desc limit 1}]
puts $indexpage "<p><a href=\"./$chanid.html\">\#$chanid</a> - $ilcount URLs - last link posted $itstamp<br>Last link title: <b>$iltitle</b></p><hr>"
}
puts $indexpage "<center><small><b>URL 2 IRC</b> by Lily</small></center></body></html>"
close $indexpage
ldb close
} else {
putlog "Web log path not valid! Not writing html files."
}
}
}
proc urltitle {url} {
global url2irc
set agent "Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
if {[info exists url] && [string length $url]} {
::http::register https 443 ::tls::socket
set http [::http::config -useragent $agent]
if {[catch {::http::geturl $url -timeout $url2irc(timeout) -validate 1} http]} {
set status [::http::status $http]
::http::cleanup $http
return $status
}
array set meta [::http::meta $http]
::http::cleanup $http
if {[info exists meta(Location)] && [incr url2irc(redirects)] < 10} {
return [urltitle $meta(Location)]
}
if {[info exists meta(Redirect)] && [incr url2irc(redirects)] < 10} {
return [urltitle $meta(Redirect)]
}
if {[info exists meta(Content-Type)]} {
set content_type [lindex [split $meta(Content-Type) ";"] 0]
} else {
set content_type "Unknown"
}
if {[info exists meta(Content-Length)]} {
set content_length $meta(Content-Length)
} else {
set content_length 0
}
if {$content_length <= $url2irc(maxsize) && [string match -nocase "text/*" $content_type]} {
set http [::http::config -useragent $agent]
if {[catch {::http::geturl $url -timeout $url2irc(timeout)} http]} {
::http::cleanup $http
return $content_type
}
set data [split [::http::data $http] \n]
::http::cleanup $http
}
set title ""
if {[info exists data] && [regexp -nocase {<title>(.*?)</title>} $data match title]} {
set title [::htmlparse::mapEscapes $title]
regsub -all {[\{\}\\]} $title "" title
regsub -all " +" $title " " title
set title [string trim $title]
return $title
} else {
return "$content_type"
}
}
}
proc tinyurl {url} {
global url2irc
if {[info exists url] && [string length $url]} {
set http [::http::geturl "http://tinyurl.com/create.php" -query [::http::formatQuery "url" $url] -timeout $url2irc(timeout)]
set data [split [::http::data $http] \n] ; ::http::cleanup $http
for {set index [llength $data]} {$index >= 0} {incr index -1} {
if {[regexp {href="http://tinyurl\.com/\w+"} [lindex $data $index] url]} {
return [string map { {href=} "" \" "" } $url]
}
}
}
return ""
}
putlog "URL 2 IRC script loaded.."