use 3rd party lib for word counts

This commit is contained in:
Andrew Brown 2021-06-30 17:41:22 -07:00
parent 0b2a5a6e6c
commit eae59ef451
3 changed files with 22 additions and 1 deletions

View File

@ -136,6 +136,7 @@ group :worker do
# Document understanding
gem 'htmlentities'
gem 'birch', git: 'https://github.com/billthompson/birch.git', branch: 'birch-ruby22'
gem 'word_count_analyzer'
gem 'engtagger'
gem 'ibm_watson'

View File

@ -1545,6 +1545,8 @@ GEM
websocket-driver (0.7.5)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
word_count_analyzer (1.0.1)
engtagger
zeitwerk (2.4.2)
PLATFORMS
@ -1623,6 +1625,7 @@ DEPENDENCIES
uglifier (>= 1.3.0)
web-console
webpacker
word_count_analyzer
RUBY VERSION
ruby 2.7.2p137

View File

@ -81,7 +81,24 @@ class Document < ApplicationRecord
end
# Counts the words in this document's body with the word_count_analyzer gem.
#
# Returns 0 when the body is nil or blank; otherwise returns whatever
# WordCountAnalyzer::Counter#count reports for the body under the options
# configured below.
def computed_word_count
  # Safe navigation covers a nil body; present? rejects an empty/blank one.
  return 0 unless body&.present?

  # NOTE(review): the exact meaning of each option value is defined by the
  # word_count_analyzer gem -- confirm against its README before changing any.
  WordCountAnalyzer::Counter.new(
    ellipsis: 'no_special_treatment',
    hyperlink: 'no_special_treatment',
    contraction: 'count_as_multiple',
    hyphenated_word: 'count_as_multiple',
    date: 'count_as_one',
    number: 'ignore',
    numbered_list: 'ignore',
    xhtml: 'keep',
    forward_slash: 'count_as_multiple',
    backslash: 'count_as_multiple',
    dotted_line: 'count',
    dashed_line: 'count',
    underscore: 'count',
    stray_punctuation: 'count'
  ).count(body)
end
def reading_estimate