#!/usr/bin/env ruby
# frozen_string_literal: true

# @author apoc
# @license GPLv3
# Test implementation of "Density" content extraction.
#
# Algorithm by Javier Arias Moreno, Koen Deschacht and Marie-Francine Moens
# https://www2.cs.kuleuven.be/cwis/research/liir/publication_files/978AriasEtAl2009.pdf

# Splits a parsed HTML document into text segments at structural tags and
# keeps the "high density region": the longest segment plus every
# sufficiently long segment that lies close to an already selected one.
#
# The document only has to respond to +traverse+, yielding nodes that
# respond to +text?+, +text+ and +name+ (e.g. a Nokogiri document).
class DensityCE
  attr_reader :c1, :c2

  # Tags that alter the structure of the HTML document.
  # (The misspelled name is kept because existing callers may reference it.)
  STRUCTUAL_TAGS = %w[p table br div h1 h2 h3 h4 h5 h6 li form].freeze

  # Correctly spelled alias of STRUCTUAL_TAGS.
  STRUCTURAL_TAGS = STRUCTUAL_TAGS

  # @param c1 [Numeric] fraction of the longest segment's length that a
  #   segment must exceed to be eligible for the region
  # @param c2 [Numeric] a segment is only added when its index distance to
  #   the nearest already selected segment is strictly less than this value
  def initialize(c1 = 0.333, c2 = 4)
    @c1 = c1
    @c2 = c2
    # Start with empty state so #plaintext is safe to call before #process.
    @texts = []
    @region = []
    @region_texts_idxs = []
  end

  # Processes the document and builds the high density region.
  #
  # @param document [#traverse] parsed document, e.g. Nokogiri::HTML::Document
  # @return [Array<String>] all text segments of the document, in order
  def process(document)
    @texts = split_into_segments(document) # @paper: L = {s1 ... sn}
    lengths = @texts.map(&:length)

    # The first longest segment seeds the region.
    peak_len = lengths.max
    peak_idx = lengths.index(peak_len)
    cutoff_len = peak_len * @c1

    before = grow_region((peak_idx - 1).downto(0), lengths, peak_idx, cutoff_len)
    after = grow_region((peak_idx + 1).upto(lengths.size - 1), lengths, peak_idx, cutoff_len)

    # Segment indices of the region, in document order.
    @region_texts_idxs = before.reverse + [peak_idx] + after
    @region = @region_texts_idxs.map { |idx| @texts[idx] }
    @texts
  end

  # Returns the plaintext result of the density algorithm: the selected
  # segments concatenated in document order. Empty before #process ran.
  #
  # @return [String]
  def plaintext
    @region.join
  end

  private

  # Concatenates text nodes in the order +document.traverse+ yields them and
  # starts a new segment whenever a structural tag is encountered.
  def split_into_segments(document)
    segments = [[]]
    document.traverse do |node|
      segments.last << node.text if node.text?
      segments << [] if STRUCTUAL_TAGS.include?(node.name)
    end
    segments.map(&:join)
  end

  # Walks +candidates+ (segment indices moving away from the seed) and
  # collects every index whose segment is longer than +cutoff_len+ and lies
  # closer than c2 to the most recently selected index.
  def grow_region(candidates, lengths, anchor, cutoff_len)
    candidates.each_with_object([]) do |idx, picked|
      # Distances only increase from here on, so nothing further can qualify.
      break picked if (idx - anchor).abs >= @c2
      next unless lengths[idx] > cutoff_len

      picked << idx
      anchor = idx # the region now reaches up to this segment
    end
  end
end

# Demo: fetch a page and print its extracted main content. Guarded so that
# loading this file as a library does not trigger a network request.
if __FILE__ == $PROGRAM_NAME
  begin
    require 'rubygems'
    require 'mechanize' # includes nokogiri as robust xml parser
  rescue LoadError => e
    warn "density demo skipped: #{e.message} (is the 'mechanize' gem installed?)"
  else
    # Older mechanize releases exposed the class as WWW::Mechanize; prefer the
    # top-level constant when it exists.
    mechanize_class = defined?(::Mechanize) ? ::Mechanize : ::WWW::Mechanize
    url = ARGV.first || 'http://sixserv.org/2009/11/09/content-extraction-algorithmen-densityccb/'
    site = mechanize_class.new.get(url)

    # create density ce with c1 = 0.333 and a wider neighbourhood (c2 = 15)
    density = DensityCE.new(0.333, 15)
    density.process(site.root)

    # print plaintext
    puts density.plaintext
  end
end