#!/usr/bin/env python
# ccbce - content code blurring content extraction
# (Token!) Content Code Blurring Example Implementation
# License: GPLv3
# Optional Requirement(for debug image): PIL
#
# This script/module implements the CCB algorithm for content extraction
# as described by Thomas Gottron in his dissertation and papers:
#
#  * http://ubm.opus.hbz-nrw.de/volltexte/2009/1859/pdf/diss.pdf
#  * http://www.informatik.uni-mainz.de/~gotti/paper/TIR-2008.pdf
#
# I also wrote an article in the sixserv blog about that (German):
#
#  * ...
#
# This script has one major problem: the stop criterion for the gaussian
# blur filter should, as described by Gottron, begin to settle after some
# iterations. I could not reproduce that, so I'm just using a fixed
# (adjustable by cli argument) number of iterations.
#
# I also want to mention that I'm just learning python for a week or so,
# so the code is maybe not that great.
#

import getopt, sys, re, math, urllib, copy

try:  # Python 3
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2
    from urllib import urlopen as _urlopen

print("Content Code Blurring Example Implementation")
print("2009 (GPLv3) @ apoc ")

# NOTE(review): the patterns found in this file were truncated (only the tail
# "]+>.*?" was left, which strips nothing useful). The three patterns below
# are reconstructed from the variable names they were assigned to
# (script/style/comment) -- confirm against the upstream script.
_SCRIPT_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>',
                        re.DOTALL | re.IGNORECASE)
_STYLE_RE = re.compile(r'<style\b[^>]*>.*?</style\s*>',
                       re.DOTALL | re.IGNORECASE)
_COMMENT_RE = re.compile(r'<!--.*?-->|<![^>]*>', re.DOTALL)


class Tccb(object):
    """Token based Content Code Blurring (CCB) extractor.

    A page is split into tokens, each flagged as content (1) or code (0).
    The flag vector is blurred with a Gaussian kernel and content tokens
    whose blurred value exceeds the threshold form the extracted text.

    t -- threshold a blurred value must exceed for a token to be kept
    r -- kernel size (number of neighbouring tokens taken into account)
    g -- number of blur passes
    """

    def __init__(self, t, r, g):
        self._threshold = t
        self._range = r
        self._gauss_iter = g
        self._tokens = None  # token strings, filled by _tokenize
        self._ccv = None     # content code vector: 1 content / 0 code
        self._ccr = None     # blurred vector, filled by _iterativeBlurring
        # create the gauss kernel
        self._genGaussKernel()

    def _genGaussKernel(self):
        """Build the symmetric blur kernel in self._kernel and print it.

        The weights are samples of the normal density with sigma = range / 2,
        peaking at the kernel centre. They are not normalised to sum to 1.
        """
        size = self._range
        if size > 0:
            sigma = float(size) / 2.0
            centre = (size - 1) / 2.0
            scale = 1.0 / math.sqrt(2 * math.pi * sigma ** 2)
            self._kernel = [
                scale * math.exp(-(abs(i - centre) ** 2) / (2 * sigma ** 2))
                for i in range(size)
            ]
        else:
            # Nothing to sample (and sigma would be 0).
            self._kernel = []
        print("Creating Gauss Kernel: " + ", ".join(map(str, self._kernel)))

    def _fetch(self, url):
        """Download url and return its body as text; exit(2) on failure."""
        try:
            site = _urlopen(url)
            try:
                contents = site.read()
            finally:
                site.close()
        except Exception:
            # Best effort, like the former bare except, but Ctrl-C and
            # SystemExit are no longer swallowed.
            print("Error in retriving url(%s)" % url)
            sys.exit(2)
        if not isinstance(contents, str):
            # Python 3 delivers bytes; the tokenizer works on text.
            contents = contents.decode("utf-8", "replace")
        return contents

    def _stripNonContent(self, contents):
        """Return contents without script/style elements and comments."""
        contents = _SCRIPT_RE.sub('', contents)
        contents = _STYLE_RE.sub('', contents)
        return _COMMENT_RE.sub('', contents)

    def _tokenize(self, contents):
        """Split contents into tokens and fill self._tokens / self._ccv.

        A token ends at '<', at '>' and, outside of tags, at a space.
        Tokens closed while inside a tag are flagged 0 (code), all others 1
        (content). A delimiter that directly follows another one yields an
        empty token; those are kept on purpose so the vector keeps the shape
        the threshold was tuned for.
        """
        tokens = []
        flags = []
        current = ''
        inside_tag = False
        for c in contents:
            if c != '<':
                current += c
            if c == '<' or c == '>' or (c == ' ' and not inside_tag):
                tokens.append(current)
                flags.append(0 if inside_tag else 1)
                # A '<' opens the next token instead of closing this one.
                current = '<' if c == '<' else ''
            if c == '<':
                inside_tag = True
            elif c == '>':
                inside_tag = False
        if current:
            # Text after the last delimiter used to be dropped silently.
            tokens.append(current)
            flags.append(0 if inside_tag else 1)
        self._tokens = tokens
        self._ccv = flags

    def _createContentCodeVector(self, url):
        """Fetch url and build the token list and content code vector."""
        self._tokenize(self._stripNonContent(self._fetch(url)))

    def _iterativeBlurring(self):
        """Blur a copy of self._ccv self._gauss_iter times into self._ccr.

        Every pass writes into a fresh list, so a pass only ever reads the
        result of the previous pass. Positions outside of the vector count
        as content (1).
        """
        kernel = self._kernel
        half = len(kernel) // 2
        current = copy.copy(self._ccv)
        length = len(current)
        for _ in range(self._gauss_iter):
            blurred = [0] * length
            for i in range(length):
                acc = 0
                for k, weight in enumerate(kernel):
                    pos = i + k - half
                    if 0 <= pos < length:
                        sample = current[pos]
                    else:
                        sample = 1
                    acc += sample * weight
                blurred[i] = acc
            current = blurred
        self._ccr = current

    def _createPlaintext(self):
        """Join all content tokens whose blurred value beats the threshold."""
        return ''.join(
            token
            for token, flag, score in zip(self._tokens, self._ccv, self._ccr)
            if flag == 1 and score > self._threshold
        )

    def getPlaintext(self, url):
        """Download url and return the extracted main content as a string."""
        self._createContentCodeVector(url)
        self._iterativeBlurring()
        return self._createPlaintext()


class TccbDebugImage(Tccb):
    """Tccb variant that also plots the blurred vector to an image file.

    Needs PIL; it is imported lazily so that plain Tccb works without it.
    """

    WIDTH = 715
    HEIGHT = 255

    def __init__(self, t, r, g, img_filename):
        Tccb.__init__(self, t, r, g)
        self._img_filename = img_filename
        from PIL import Image
        self._img = Image.new("RGB",
                              (TccbDebugImage.WIDTH, TccbDebugImage.HEIGHT),
                              (255, 255, 255))

    def _columnAverages(self, columns):
        """Return `columns` averages covering self._ccr from start to end.

        Each column averages its proportional slice of the vector. When
        there are fewer values than columns, neighbouring columns share a
        value; an empty vector gives all zeros.
        """
        total = len(self._ccr)
        if total == 0:
            return [0.0] * columns
        averages = []
        for ix in range(columns):
            start = ix * total // columns
            end = max(start + 1, (ix + 1) * total // columns)
            chunk = self._ccr[start:end]
            averages.append(sum(chunk) / float(len(chunk)))
        return averages

    # overwrite method, plot debug image _after_ blurring
    def _iterativeBlurring(self):
        """Run the normal blurring, then draw self._ccr and save the image."""
        Tccb._iterativeBlurring(self)
        from PIL import ImageDraw
        width = TccbDebugImage.WIDTH
        height = TccbDebugImage.HEIGHT
        # plot ccr:
        ccr_plot = ImageDraw.Draw(self._img)
        ccr_plot.rectangle((10, 10, width - 10, height - 10),
                           outline=(204, 204, 204))
        max_height = height - 20  # keep the bars inside the frame
        for ix, ccr_avg in enumerate(self._columnAverages(width - 20)):
            x = ix + 10
            # ratio => pixel height size
            ccr_height = min(ccr_avg * 500, max_height)
            # draw line
            ccr_plot.line((x, height - 10 - ccr_height, x, height - 10),
                          fill=(78, 78, 78))
        del ccr_plot
        # save image:
        self._img.save(self._img_filename)


def usage():
    """Print the command line help to stdout."""
    print("")
    print("Usage: " + sys.argv[0] + " [OPTION]... [URL]")
    print(" -t, --threshold=VAL optional thershold value, default: 0.12")
    print(" -r, --range=VAL optional range value, default: 25")
    print(" -g, --gauss=VAL iterations for the gauss filter, default: 3")
    print(" -d, --debug=STR plot CCV image to given filename")
    print(" -h, --help display this help and exit")


# This module can also be used as a library for CCB; if run standalone
# the script is controlled by various command line arguments.
def main():
    """Command line entry point: parse options, extract and print the text.

    Options are read from sys.argv (see usage()). The page URL is the last
    positional argument and has to start with http:// or https://.
    Exits with status 2 on bad options, bad option values or a missing URL.
    """
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "ht:r:g:d:",
            ["help", "threshold=", "range=", "gauss=", "debug="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    # default settings
    t = 0.12  # 0.75
    r = 25
    gauss = 3
    debug_image = False

    try:
        for o, a in opts:
            if o in ("-h", "--help"):
                usage()
                sys.exit()
            elif o in ("-t", "--threshold"):
                t = float(a)
            elif o in ("-r", "--range"):
                r = int(a)
            elif o in ("-g", "--gauss"):
                gauss = int(a)
            elif o in ("-d", "--debug"):
                debug_image = a
            else:
                # getopt only returns declared options; raise explicitly so
                # the check is not stripped under "python -O".
                raise AssertionError("unhandled option")
    except ValueError:
        print("Error in parsing parameters. Wrong type? (threshold=float range/gauss=int)")
        sys.exit(2)

    # Take the URL from the parsed positional arguments rather than from
    # sys.argv[-1], so that an option value (e.g. "-d http://...") is never
    # mistaken for the page URL.
    if not args or not re.match(r'http(s?)://', args[-1]):
        print("Missing URL!")
        usage()
        sys.exit(2)

    url = args[-1]
    print("URL: " + url)

    if not debug_image:
        tccb = Tccb(t, r, gauss)
    else:
        tccb = TccbDebugImage(t, r, gauss, debug_image)

    print(tccb.getPlaintext(url))


if __name__ == "__main__":
    main()