require 'active_support/concern'

# Text-analysis helpers for a model that exposes an HTML `body`.
#
# Everything is derived from `body`, either directly (see #paragraphs) or via
# the plain-text rendering returned by Documents::PlaintextService.from_html.
# Most results are memoized in instance variables, so they are NOT recomputed
# if `body` changes later on the same instance.
module HasParseableText
  extend ActiveSupport::Concern

  included do
    # @return the result of Documents::PlaintextService.from_html for `body`
    #   (treated as a String by every other method here)
    def plaintext
      @plaintext ||= Documents::PlaintextService.from_html(self.body)
    end

    # @return [Array<String>] every character of #plaintext
    def characters
      @characters ||= plaintext.chars
    end

    # Text of each <p>...</p> element in `body`, followed by the text of each
    # <div>...</div> element.
    #
    # Only elements whose content is non-empty and contains no nested tag are
    # picked up (`[^<]+`), so empty paragraphs are skipped automatically.
    #
    # @return [Array<String>] paragraph texts: all <p> matches first, then all
    #   <div> matches (not interleaved in document order)
    def paragraphs
      @paragraphs ||= begin
        # Normalize text
        ## We use paragraph tags by default, but people might paste in divs also
        # `.to_s` guards against a nil result from the sanitizer (e.g. nil body).
        sanitized = ActionController::Base.helpers.sanitize(body, tags: %w(div p), attributes: %w()).to_s

        # Line breaks are stripped before matching so they never end up inside
        # a paragraph's text.
        sanitized = sanitized.gsub("\n", '')

        # The opening tag must be part of each pattern; without it the scan
        # cannot anchor on an element and nothing is ever matched.
        [%r{<p>[^<]+</p>}, %r{<div>[^<]+</div>}].flat_map do |pattern|
          sanitized.scan(pattern).map { |html| ActionView::Base.full_sanitizer.sanitize(html) }
        end
      end
    end

    # Splits #plaintext on `!`, `?` and `.`; the terminators themselves are
    # dropped and newlines inside a sentence become spaces.
    #
    # @return [Array<String>] non-empty sentence fragments
    def sentences
      @sentences ||= plaintext
                     .strip
                     .split(/[!\?\.]/)
                     .reject(&:empty?)
                     .map { |sentence| sentence.gsub("\n", ' ') }
    end

    # @return [Array<String>] downcased, whitespace-separated words with every
    #   character other than word characters, whitespace and apostrophes removed
    def words
      @words ||= plaintext.downcase.gsub(/[^\s\w\d']/, '').split(' ')
    end

    # TODO: this might make more sense as a word count splitter instead of lines?
    #
    # @return [Enumerator] slices of #plaintext lines, each holding at most
    #   Documents::PlaintextService::PLAINTEXT_LINES_PER_PAGE lines
    def pages
      @pages ||= plaintext.split("\n").each_slice(Documents::PlaintextService::PLAINTEXT_LINES_PER_PAGE)
    end

    # All-uppercase, non-numeric words longer than one character.
    #
    # Works on the case-preserved plaintext rather than #words: #words is
    # downcased, so an uppercase comparison against it can never succeed for a
    # word that contains letters.
    #
    # @return [Array<String>] unique acronyms, sorted
    def acronyms
      @acronyms ||= plaintext
                    .gsub(/[^\s\w\d']/, '')
                    .split(' ')
                    .select { |word| word == word.upcase && word.length > 1 && !is_numeric?(word) }
                    .uniq
                    .sort
    end

    # As defined by Robert Gunning in the GFI and SMOG
    #
    # @return [Array<String>] unique words for which
    #   Documents::Analysis::SyllablesService.count is 3 or more
    def complex_words
      @complex_words ||= unique_words.select { |word| Documents::Analysis::SyllablesService.count(word) >= 3 }
    end

    # @return [Array<String>] unique words that are not #complex_words
    def simple_words
      @simple_words ||= unique_words - complex_words
    end

    # @return [Array<String>] #words without duplicates (#words is already
    #   downcased, so no further case folding is needed)
    def unique_words
      words.uniq
    end

    # @param syllable_count [Integer] exact syllable count to match
    # @return [Array<String>] every word (duplicates included) for which
    #   Documents::Analysis::SyllablesService.count equals `syllable_count`
    def words_with_syllables(syllable_count)
      words.select { |word| Documents::Analysis::SyllablesService.count(word) == syllable_count }
    end

    # @return [Array] the Documents::Analysis::SyllablesService.count result
    #   for each word, in the same order as #words
    def word_syllables
      words.map { |word| Documents::Analysis::SyllablesService.count(word) }
    end

    # @param string [Object] value to test
    # @return [Boolean] true when Kernel#Float accepts `string`
    def is_numeric?(string)
      Float(string)
      true
    rescue ArgumentError, TypeError
      # Float raises ArgumentError for malformed strings and TypeError for
      # values it cannot convert (e.g. nil); anything else should surface.
      false
    end
  end
end