From eae59ef451ebb53cb6f36cf05fe2e05709805d5d Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 30 Jun 2021 17:41:22 -0700 Subject: [PATCH] use 3rd party lib for word counts --- Gemfile | 1 + Gemfile.lock | 3 +++ app/models/documents/document.rb | 19 ++++++++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index 1cc47978..c3fc14ad 100644 --- a/Gemfile +++ b/Gemfile @@ -136,6 +136,7 @@ group :worker do # Document understanding gem 'htmlentities' gem 'birch', git: 'https://github.com/billthompson/birch.git', branch: 'birch-ruby22' + gem 'word_count_analyzer' gem 'engtagger' gem 'ibm_watson' diff --git a/Gemfile.lock b/Gemfile.lock index e28051b4..86a9478f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1545,6 +1545,8 @@ GEM websocket-driver (0.7.5) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.5) + word_count_analyzer (1.0.1) + engtagger zeitwerk (2.4.2) PLATFORMS @@ -1623,6 +1625,7 @@ DEPENDENCIES uglifier (>= 1.3.0) web-console webpacker + word_count_analyzer RUBY VERSION ruby 2.7.2p137 diff --git a/app/models/documents/document.rb b/app/models/documents/document.rb index 0e0905d4..2fc3cefe 100644 --- a/app/models/documents/document.rb +++ b/app/models/documents/document.rb @@ -81,7 +81,24 @@ class Document < ApplicationRecord end def computed_word_count - (self.body || "").scan(/[\w-]+/).size + return 0 unless self.body && self.body.present? + + WordCountAnalyzer::Counter.new( + ellipsis: 'no_special_treatment', + hyperlink: 'no_special_treatment', + contraction: 'count_as_multiple', + hyphenated_word: 'count_as_multiple', + date: 'count_as_one', + number: 'ignore', + numbered_list: 'ignore', + xhtml: 'keep', + forward_slash: 'count_as_multiple', + backslash: 'count_as_multiple', + dotted_line: 'count', + dashed_line: 'count', + underscore: 'count', + stray_punctuation: 'count' + ).count(self.body) end def reading_estimate