mirror of
https://github.com/indentlabs/notebook.git
synced 2025-10-26 11:19:22 +00:00
117 lines
3.7 KiB
Ruby
117 lines
3.7 KiB
Ruby
module Documents
  # Converts HTML markup into word-wrapped plain text.
  class PlaintextService < Service
    # Not referenced anywhere else in this file; presumably the page size
    # callers use when paginating plaintext output -- confirm against callers.
    PLAINTEXT_LINES_PER_PAGE = 25

    # From https://github.com/alexdunae/premailer/blob/master/lib/premailer/html_to_plain_text.rb
    #
    # Produces a plain-text rendering of an HTML string: images become their
    # alt text, links become "text ( url )", headings get underline/overline
    # rules, list items get a "* " bullet, remaining tags are stripped,
    # entities are decoded, and the result is word-wrapped.
    #
    # @param html [String, nil] markup to convert; the argument is not mutated
    # @param line_length [Integer] wrap column, also the cap on heading rule width
    # @param from_charset [String] not used by this implementation; kept so the
    #   signature stays compatible with existing callers
    # @return [String] the stripped plain-text rendering ("" when html is nil)
    def self.from_html(html, line_length = 80, from_charset = 'UTF-8')
      return "" if html.nil?

      txt = html.dup

      # Drop regions explicitly marked as HTML-only (e.g. headers and footers
      # that are not wanted in the text version).
      txt.gsub!(/<!-- start text\/html -->.*?<!-- end text\/html -->/m, '')

      # Replace images with their alt text. Handles double-quoted attributes,
      # with or without a self-closing slash: <img alt="" /> and <img alt="">
      txt.gsub!(/<img.+?alt=\"([^\"]*)\"[^>]*\>/i, '\1')

      # Same for single-quoted attributes: <img alt='' /> and <img alt=''>
      txt.gsub!(/<img.+?alt=\'([^\']*)\'[^>]*\>/i, '\1')

      # Links: render as "text ( href )"; anchors with no content are removed.
      # A leading "mailto:" is matched separately so it is left out of the href.
      txt.gsub!(/<a\s.*?href=["'](mailto:)?([^"']*)["'][^>]*>((.|\s)*?)<\/a>/i) do
        href = Regexp.last_match(2)
        label = Regexp.last_match(3)
        label.empty? ? '' : "#{label.strip} ( #{href.strip} )"
      end

      # Headings (H1-H6). Closing tags are first moved onto their own line:
      # `.` does not match a newline in the pattern below, so the greedy
      # `(.*)` stops at the end of the heading text's line.
      txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1")
      txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do
        # Captures are read before the helper runs its own substitutions,
        # which would otherwise overwrite the last-match data.
        format_heading(Regexp.last_match(1).to_i, Regexp.last_match(2), line_length)
      end

      # Collapse whitespace between adjacent spans to a single space.
      txt.gsub!(/(<\/span>)[\s]+(<span)/mi, '\1 \2')

      # Lists -- TODO: should handle ordered lists
      txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
      # list item end not followed by a newline
      txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

      # Paragraphs and line breaks.
      txt.gsub!(/<\/p>/i, "\n\n")
      txt.gsub!(/<br[\/ ]*>/i, "\n")

      # Strip whatever tags remain.
      txt.gsub!(/<\/?[^>]*>/, '')

      # Decode HTML entities.
      txt = HTMLEntities.new.decode(txt)

      # Collapse runs of two or more spaces into a single space.
      txt.gsub!(/ {2,}/, " ")

      txt = word_wrap(txt, line_length)

      # Normalize line endings (\r\n and \r -> \n).
      txt.gsub!(/\r\n?/, "\n")

      # Strip extra spaces.
      txt.gsub!(/[ \t]*\302\240+[ \t]*/, " ") # non-breaking spaces -> spaces
      txt.gsub!(/\n[ \t]+/, "\n")             # space at start of lines
      txt.gsub!(/[ \t]+\n/, "\n")             # space at end of lines

      # No more than two consecutive newlines.
      txt.gsub!(/[\n]{3,}/, "\n\n")

      # Word wrapping can split "( url )" right after "(" or right before ")".
      # Re-join the URL with its parentheses and move any such line break
      # outside them, on the side where it occurred.
      txt.gsub!(/\(([ \n])(http[^)]+)([\n ])\)/) do
        leading = Regexp.last_match(1) == "\n" ? "\n" : ''
        trailing = Regexp.last_match(3) == "\n" ? "\n" : ''
        "#{leading}( #{Regexp.last_match(2)} )#{trailing}"
      end

      txt.strip
    end

    # Taken from Rails' word_wrap helper (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap)
    #
    # Wraps each line of text longer than line_length by breaking at
    # whitespace. Lines already within the limit are returned untouched.
    #
    # @param text [String] text to wrap
    # @param line_length [Integer] maximum characters per output line
    # @return [String] the wrapped text, lines joined with "\n"
    def self.word_wrap(text, line_length)
      text.split("\n").map do |line|
        next line unless line.length > line_length

        line.gsub(/(.{1,#{line_length}})(\s+|$)/, "\\1\n").strip
      end.join("\n")
    end

    # Renders one heading as plain text, padded with blank lines on both sides.
    #
    # @param level [Integer] the number captured from the opening <hN> tag
    # @param text [String] inner markup of the heading
    # @param line_length [Integer] upper bound for the width of the rule lines
    # @return [String] the decorated heading
    def self.format_heading(level, text, line_length)
      text = text.gsub(/<br[\s]*\/?>/i, "\n") # handle <br>s
      text = text.gsub(/<\/?[^>]*>/i, '')     # strip tags

      # The rule is as wide as the longest heading line, capped at line_length.
      widest = text.each_line.map { |line| line.strip.length }.max || 0
      width = [widest, line_length].min

      decorated =
        case level
        when 1 # H1, asterisks above and below
          rule = '*' * width
          "#{rule}\n#{text}\n#{rule}"
        when 2 # H2, dashes above and below
          rule = '-' * width
          "#{rule}\n#{text}\n#{rule}"
        else # H3-H6, dashes below
          "#{text}\n#{'-' * width}"
        end

      "\n\n#{decorated}\n\n"
    end
    private_class_method :format_heading
  end
end
|