module Documents
  # Converts HTML markup into word-wrapped plain text.
  class PlaintextService < Service
    PLAINTEXT_LINES_PER_PAGE = 25

    # Renders an HTML string as plain text.
    #
    # From https://github.com/alexdunae/premailer/blob/master/lib/premailer/html_to_plain_text.rb
    #
    # NOTE(review): the tag-matching patterns below follow the upstream
    # premailer implementation; confirm them against this project's history.
    #
    # @param html [String, nil] markup to convert; the argument is not mutated
    # @param line_length [Integer] column at which long lines are wrapped and
    #   the maximum width of heading rules
    # @param from_charset [String] not read by this method; retained so the
    #   signature stays compatible with existing callers
    # @return [String] the plain-text rendering ("" when +html+ is nil)
    def self.from_html(html, line_length = 80, from_charset = 'UTF-8')
      return "" if html.nil?

      # Work on a copy: every step below mutates the string in place.
      txt = html.dup

      # strip text ignored html. Useful for removing
      # headers and footers that aren't needed in the
      # text version
      txt.gsub!(/<!-- start text\/html -->.*?<!-- end text\/html -->/m, '')

      # replace images with their alt attributes
      # for img tags with "" for attribute quotes
      # with or without closing tag
      # eg. the following formats:
      # <img alt="" />
      # <img alt="">
      txt.gsub!(/<img.+?alt="([^"]*)"[^>]*>/i, '\1')

      # for img tags with '' for attribute quotes
      # with or without closing tag
      # eg. the following formats:
      # <img alt='' />
      # <img alt=''>
      txt.gsub!(/<img.+?alt='([^']*)'[^>]*>/i, '\1')

      # links: "label ( target )"; anchors without any content are dropped
      txt.gsub!(/<a\s.*?href=["'](mailto:)?([^"']*)["'][^>]*>((.|\s)*?)<\/a>/i) do
        target = Regexp.last_match(2)
        label = Regexp.last_match(3)
        label.empty? ? '' : label.strip + ' ( ' + target.strip + ' )'
      end

      # handle headings (H1-H6)
      txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1") # move closing tags to new lines
      txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do
        # Read the captures before running further regexps, which reset them.
        hlevel = Regexp.last_match(1).to_i
        htext = Regexp.last_match(2)

        htext.gsub!(/<br[\s]*\/?>/i, "\n") # handle <br>s
        htext.gsub!(/<\/?[^>]*>/i, '') # strip tags

        # rule width: the longest heading line, capped at line_length
        widest = htext.each_line.map { |l| l.strip.length }.max || 0
        hlength = [widest, line_length].min

        htext =
          case hlevel
          when 1 # H1, asterisks above and below
            ('*' * hlength) + "\n" + htext + "\n" + ('*' * hlength)
          when 2 # H2, dashes above and below
            ('-' * hlength) + "\n" + htext + "\n" + ('-' * hlength)
          else # H3-H6, dashes below
            htext + "\n" + ('-' * hlength)
          end

        "\n\n" + htext + "\n\n"
      end

      # wrap spans
      txt.gsub!(/(<\/span>)[\s]+(<span)/mi, '\1 \2')

      # lists -- TODO: should handle ordered lists
      txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
      # list not followed by a newline
      txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

      # paragraphs and line breaks
      txt.gsub!(/<\/p>/i, "\n\n")
      txt.gsub!(/<br[\/ ]*>/i, "\n")

      # strip remaining tags
      txt.gsub!(/<\/?[^>]*>/, '')

      # decode HTML entities
      he = HTMLEntities.new
      txt = he.decode(txt)

      # collapse every run of two or more spaces into a single space
      txt.gsub!(/ {2,}/, " ")

      txt = word_wrap(txt, line_length)

      # remove linefeeds (\r\n and \r -> \n)
      txt.gsub!(/\r\n?/, "\n")

      # strip extra spaces
      txt.gsub!(/[ \t]*\302\240+[ \t]*/, " ") # non-breaking spaces -> spaces
      txt.gsub!(/\n[ \t]+/, "\n") # space at start of lines
      txt.gsub!(/[ \t]+\n/, "\n") # space at end of lines

      # no more than two consecutive newlines
      txt.gsub!(/[\n]{3,}/, "\n\n")

      # the word wrap messes up the parens around link targets: re-pad the URL
      # with single spaces, keeping a newline on whichever side had one
      txt.gsub!(/\(([ \n])(http[^)]+)([\n ])\)/) do
        before = Regexp.last_match(1)
        url = Regexp.last_match(2)
        after = Regexp.last_match(3)
        # The trailing side must re-emit its own capture, not the leading one.
        (before == "\n" ? before : '') + '( ' + url + ' )' + (after == "\n" ? after : '')
      end

      txt.strip
    end

    # Wraps each line of +text+ that is longer than +line_length+ by breaking
    # it at whitespace; shorter lines are returned untouched.
    #
    # Taken from Rails' word_wrap helper
    # (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap)
    #
    # @param text [String] newline-separated text
    # @param line_length [Integer] maximum width of a wrapped line
    # @return [String] the wrapped lines joined with "\n"
    def self.word_wrap(text, line_length)
      wrapped = text.split("\n").map do |line|
        next line unless line.length > line_length

        line.gsub(/(.{1,#{line_length}})(\s+|$)/, "\\1\n").strip
      end
      wrapped.join("\n")
    end
  end
end