#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Pyfibot START Knowledge Parser

@author Ville 'tuhoojabotti' Lahdenvuo (http://www.tuhoojabotti.com/)
@copyright Copyright (c) 2011 Ville Lahdenvuo
@licence BSD
"""

import urllib
import re
import htmlentitydefs
import sys
import os
import logging
import yaml

# NOTE(review): `getUrl` and `Comment` are used below but are not defined or
# imported in this file; presumably the bot injects getUrl and Comment comes
# from BeautifulSoup -- confirm before relying on this module standalone.

# Initialize logger
log = logging.getLogger('ask')

"""Config module_ask.conf:
sentences: 1   - How many sentences of the output to print if it's longer than max length
maxlength: 150 - How many chars is the max length of output
Note: A shortlink will be applied after the maxlength e.g. See http://href.fi/xxx for more.
"""

# Settings read by init(). Starting out as an empty dict lets getSTARTReply()
# fall back to its built-in defaults when init() has not run yet.
askconfig = {}

# Values of the "quality" attribute START uses when it could not answer.
_FAIL_QUALITIES = re.compile("(KNOW-DONT-KNOW|DONT-KNOW|UNKNOWN-WORD|MISSPELLED-WORD|CANT-PARSE|FORBIDDEN-ASSERTION|LEXICON)")
# Words hinting that a reply is media or an embedded document, not plain text.
_MEDIA_WORDS = re.compile("doctype|click|map|below", re.IGNORECASE)


def init(botconfig):
    """Load module_ask.conf from the 'modules' directory under sys.path[0].

    botconfig is accepted for the bot's module interface but is not used.
    Raises whatever open() or yaml.load() raise if the file is missing or
    malformed.
    """
    global askconfig
    configfile = os.path.join(sys.path[0], 'modules', 'module_ask.conf')
    # NOTE(review): yaml.load() can construct arbitrary objects. That is
    # acceptable for a trusted local file; consider yaml.safe_load().
    with open(configfile) as conf:
        # An empty file parses to None; keep a dict so .get() keeps working.
        askconfig = yaml.load(conf) or {}


def command_ask(bot, user, channel, args):
    """Ask a question from the START (http://start.csail.mit.edu/)

    Usage: .ask <question>
    """
    return bot.say(channel, getSTARTReply(args))


def _clean_text(raw):
    """Flatten scraped text into one tidy line and decode HTML entities."""
    # Remove possibly remaining HTML tags (like BASE) the parser did not handle
    text = re.sub(r"<.*?>", "", raw)
    # One-line it. Literal &nbsp; is folded here as well so that it does not
    # survive as U+00A0 once unescape() runs.
    text = re.sub(r"[\n\r\t ]|&nbsp;", " ", text).strip(' \t')
    # Compress multiple spaces into one
    text = re.sub(r" {2,}", " ", text)
    # Clean up hex and html escaped chars
    return unescape(text)


def _crop_words(text, limit):
    """Cut text down to at most limit characters without splitting a word."""
    if len(text) <= limit:
        return text
    head = text[:limit]
    cut = head.rfind(" ")
    # Without any space to cut at, a hard cut beats returning nothing at all.
    return head[:cut] if cut > 0 else head


def getSTARTReply(q):
    """Query START with the question q and return a one-line reply for IRC.

    The result page is fetched through getUrl(), the reply spans are cleaned
    up and the shortest usable answer is chosen. Answers longer than the
    configured 'maxlength' (default 120) are reduced to the first 'sentences'
    (default 1) sentences, then cropped by word, and a short link to the full
    result is appended.

    Returns a string: a fixed message for invalid input or failures, otherwise
    the UTF-8 encoded answer (or START's failure text prefixed with "Fail: ").
    """
    # Test emptiness first so that None is rejected instead of crashing len().
    if not q or len(q) < 3:
        return "Your argument is invalid."

    sentences = askconfig.get('sentences', 1)
    absmaxlen = askconfig.get('maxlength', 120)

    # Python 2's urllib cannot quote non-ASCII unicode objects; hand it bytes.
    if isinstance(q, unicode):
        q = q.encode('utf-8')
    url = "http://start.csail.mit.edu/startfarm.cgi?QUERY=%s" % urllib.quote_plus(q)

    # Retrieve data from the internet service
    bs = getUrl(url).getBS()
    if not bs:
        return "Failed to contact START. Try again later."

    # Find useful tags from the HTML mess: spans with quality T that do not
    # themselves contain another span with quality T.
    reply_attrs = {'type': 'reply', 'quality': 'T'}
    data_tags = [tag for tag in bs(name='span', attrs=reply_attrs)
                 if not tag(name="span", attrs=reply_attrs)]

    if not data_tags:
        # No answer; look for the spans describing why the question failed.
        fail_attrs = {'type': 'reply', 'quality': _FAIL_QUALITIES}
        fail_tags = [tag for tag in bs(name="span", attrs=fail_attrs)
                     if not tag(name="span", attrs=fail_attrs)]
        if not fail_tags:
            log.debug("Failed to parse data from:")
            log.debug(bs)
            log.debug("data: %s" % data_tags)
            log.debug("fails: %s" % fail_tags)
            return "Failed to parse data. :/"

        # Return the first fail tag's text, skipping HTML comments and the
        # Accept/Abort controls.
        s = "".join([node for node in fail_tags[0](text=True)
                     if not isinstance(node, Comment)
                     and re.search("Accept|Abort", node) is None])
        s = _clean_text(s)
        if len(s) > absmaxlen:
            s = _crop_words(s, absmaxlen) + "..."
        return unicode("Fail: " + s).encode('utf-8')

    answers = []
    media = False  # Do we have media such as js, img in the results
    for answer in data_tags:
        # Cleanups on html depth
        for sup in answer.findAll('sup'):
            # Render <sup> content as ^content
            sup.replaceWith(("^%s" % sup.string) if sup.string is not None else " ")
        for br in answer.findAll('br'):
            # A <br> becomes a plain space
            br.replaceWith(" ")
        for td in answer.findAll('td'):
            # Drop <td> cells holding fewer than 10 characters of text
            if len("".join(td.findAll(text=True))) < 10:
                td.extract()
        for comment in answer.findAll(text=lambda text: isinstance(text, Comment)):
            # Drop HTML comments
            comment.extract()

        # Find media by looking for tags like img and script and words like
        # doctype, map, click (it sometimes embeds a whole HTML document in
        # the results).
        if answer.findAll({"img": True, "script": True}) \
                or _MEDIA_WORDS.search("".join(answer(text=True))) is not None:
            media = True

        # Cleanups on string depth
        answers.append(_clean_text("".join(answer(text=True))))

    # Try to find suitable data for IRC: the shortest non-trivial, non-media answer
    candidates = [ans for ans in answers
                  if len(ans) > 10 and not _MEDIA_WORDS.search(ans)]
    if not candidates:
        if media:
            return "Take a look at %s :P" % shorturl(url).encode("utf-8")
        return "Sorry, I don't know"
    answer = min(candidates, key=len)

    # The templates below are unicode literals: formatting a byte string that
    # contains the non-ASCII dash with unicode text fails on Python 2.
    if len(answer) > absmaxlen:
        # It's longer than absolute max chars, try keeping the first n sentences.
        answer = ". ".join(answer.split(". ")[:sentences])
        if not answer.endswith("."):
            answer += "."
        # If it's still too long, cut at a word boundary.
        answer = _crop_words(answer, absmaxlen)
        answer = u"%s – See %s for more." % (answer, shorturl(url))
    elif media:
        # It's not too long, but additional media is available, so give a link.
        answer = u"%s – See %s for media." % (answer, shorturl(url))
    return unicode(unescape(answer)).encode('utf-8')

def unescape(text):
    """Replace HTML character references and named entities in text.

    Handles decimal (&#65;), hexadecimal (&#x41; or &#X41;) and named (&amp;)
    forms. References that are malformed, unknown or outside the valid
    character range are left untouched.

    Returns the text with every recognised reference replaced.
    """
    # Resolved locally so the helper works on both Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint
        to_char = unichr
    except ImportError:
        from html.entities import name2codepoint
        to_char = chr

    def fixup(match):
        entity = match.group(0)
        if entity.startswith("&#"):
            # Numeric character reference
            digits = entity[2:-1]
            try:
                if digits[:1] in ("x", "X"):
                    return to_char(int(digits[1:], 16))
                return to_char(int(digits))
            except (ValueError, OverflowError):
                # Not a number, or a code point no character exists for.
                return entity
        # Named entity
        try:
            return to_char(name2codepoint[entity[1:-1]])
        except KeyError:
            return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)

def shorturl(url):
    """Shorten url with the href.fi API.

    Returns the body of the API response, or url unchanged when the request
    fails for any reason (shortening is best-effort).
    """
    try:
        response = urllib.urlopen("http://href.fi/api.php?%s" % urllib.urlencode({'create': url}))
        try:
            return response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
    except Exception:
        # Deliberately broad: any failure falls back to the long URL, while
        # KeyboardInterrupt and SystemExit still propagate.
        return url