# notebook/app/services/documents/plaintext_service.rb

module Documents
  class PlaintextService < Service
    # Number of plaintext lines per page. Not referenced by the methods in this
    # class; presumably read by paging code elsewhere -- TODO confirm callers.
    PLAINTEXT_LINES_PER_PAGE = 25

    # Converts an HTML fragment into word-wrapped plain text.
    #
    # Adapted from Premailer's HtmlToPlainText:
    # https://github.com/alexdunae/premailer/blob/master/lib/premailer/html_to_plain_text.rb
    #
    # The conversion is a fixed sequence of regex passes over a copy of the
    # input (the caller's string is never mutated): drop regions marked as
    # HTML-only, replace images with their alt text, render links as
    # "text ( url )", underline/overline headings, turn list items into "* "
    # bullets, translate </p> and <br> into newlines, strip the remaining
    # tags, decode entities, wrap to +line_length+ and tidy whitespace.
    #
    # @param html [String, nil] markup to convert; nil yields ""
    # @param line_length [Integer] wrap width, also the cap for heading rules
    # @param from_charset [String] not referenced by this method
    # @return [String] the plain-text rendering, stripped of outer whitespace
    def self.from_html(html, line_length = 80, from_charset = 'UTF-8')
      return "" if html.nil?

      txt = html.dup

      # Remove regions explicitly marked as HTML-only (e.g. headers and
      # footers that make no sense in the text version).
      txt.gsub!(/<!-- start text\/html -->.*?<!-- end text\/html -->/m, '')

      # Replace images with their alt text. Handles double-quoted attributes,
      # with or without a self-closing slash: <img alt="" /> and <img alt="">
      txt.gsub!(/<img.+?alt=\"([^\"]*)\"[^>]*\>/i, '\1')

      # Same for single-quoted attributes: <img alt='' /> and <img alt=''>
      txt.gsub!(/<img.+?alt=\'([^\']*)\'[^>]*\>/i, '\1')

      # Links become "text ( url )"; links without any text are dropped.
      # [\s\S] matches any character including newlines; using one character
      # class instead of an alternation such as (.|\s) avoids heavy
      # backtracking on an <a> tag that is never closed.
      txt.gsub!(/<a\s.*?href=["'](mailto:)?([^"']*)["'][^>]*>([\s\S]*?)<\/a>/i) do
        link_text = $3
        link_target = $2
        link_text.empty? ? '' : link_text.strip + ' ( ' + link_target.strip + ' )'
      end

      # Headings (H1-H6). Closing tags are first moved onto their own line so
      # the single-line (.*) capture below stops at the end of the heading.
      txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1")
      txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do
        # Read both captures before running any other regex: the gsub calls
        # below overwrite the match globals.
        hlevel = $1.to_i
        htext = $2

        htext = htext.gsub(/<br[\s]*\/?>/i, "\n") # <br> inside a heading -> newline
        htext = htext.gsub(/<\/?[^>]*>/i, '')     # strip any nested tags

        # The rule is as wide as the longest heading line, capped at line_length.
        hlength = htext.each_line.map { |heading_line| heading_line.strip.length }.max || 0
        hlength = line_length if hlength > line_length

        htext =
          case hlevel
          when 1 # H1: asterisks above and below
            ('*' * hlength) + "\n" + htext + "\n" + ('*' * hlength)
          when 2 # H2: dashes above and below
            ('-' * hlength) + "\n" + htext + "\n" + ('-' * hlength)
          else   # H3-H6: dashes below only
            htext + "\n" + ('-' * hlength)
          end

        "\n\n" + htext + "\n\n"
      end

      # Keep adjacent spans separated by exactly one space.
      txt.gsub!(/(<\/span>)[\s]+(<span)/mi, '\1 \2')

      # Lists -- TODO: should handle ordered lists
      txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
      # End every list item with a newline.
      txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

      # Paragraphs and line breaks.
      txt.gsub!(/<\/p>/i, "\n\n")
      txt.gsub!(/<br[\/ ]*>/i, "\n")

      # Strip whatever tags are left.
      txt.gsub!(/<\/?[^>]*>/, '')

      # Decode HTML entities.
      he = HTMLEntities.new
      txt = he.decode(txt)

      # Collapse runs of two or more spaces into a single space.
      txt.gsub!(/ {2,}/, " ")

      txt = word_wrap(txt, line_length)

      # Normalize carriage returns (\r\n and \r -> \n).
      txt.gsub!(/\r\n?/, "\n")

      # Strip extra spaces.
      txt.gsub!(/[ \t]*\302\240+[ \t]*/, " ") # non-breaking spaces -> spaces
      txt.gsub!(/\n[ \t]+/, "\n") # space at start of lines
      txt.gsub!(/[ \t]+\n/, "\n") # space at end of lines

      # No more than two consecutive newlines.
      txt.gsub!(/[\n]{3,}/, "\n\n")

      # Word wrapping can split "( url )" across lines. Re-join it and move any
      # newline that ended up inside the parentheses to the same side outside.
      txt.gsub!(/\(([ \n])(http[^)]+)([\n ])\)/) do
        leading = $1 == "\n" ? "\n" : ''
        trailing = $3 == "\n" ? "\n" : ''
        leading + '( ' + $2 + ' )' + trailing
      end

      txt.strip
    end

    # Wraps each line of +text+ so that it is at most +line_length+ characters
    # wide where a whitespace break is available. Lines that already fit are
    # passed through unchanged.
    #
    # Adapted from Rails' word_wrap helper
    # (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap)
    #
    # @param text [String] text to wrap; split on "\n" before wrapping
    # @param line_length [Integer] maximum width of a wrapped line
    # @return [String] the wrapped lines joined with "\n"
    def self.word_wrap(text, line_length)
      wrapped_lines = text.split("\n").map do |source_line|
        next source_line unless source_line.length > line_length

        # Break after the longest run of up to line_length characters that is
        # followed by whitespace (or the end of the line).
        source_line.gsub(/(.{1,#{line_length}})(\s+|$)/, "\\1\n").strip
      end

      wrapped_lines.join("\n")
    end
  end
end