mirror of
https://github.com/indentlabs/notebook.git
synced 2025-10-26 11:19:22 +00:00
90 lines
3.5 KiB
Ruby
90 lines
3.5 KiB
Ruby
namespace :backfill do
|
|
desc "Backfill cached word counts on all attributes"
task attribute_word_count_caches: :environment do
  # Build the analyzer once. Its options are the same for every attribute, so
  # constructing a new one per record only repeated identical setup work.
  counter = WordCountAnalyzer::Counter.new(
    ellipsis: 'no_special_treatment',
    hyperlink: 'count_as_one',
    contraction: 'count_as_one',
    hyphenated_word: 'count_as_one',
    date: 'no_special_treatment',
    number: 'count',
    numbered_list: 'ignore',
    xhtml: 'remove',
    forward_slash: 'count_as_multiple_except_dates',
    backslash: 'count_as_one',
    dotted_line: 'ignore',
    dashed_line: 'ignore',
    underscore: 'ignore',
    stray_punctuation: 'ignore'
  )

  # Visit only attributes that have no cached count yet and whose value is not
  # one of the trivial placeholders ("", " ", ".") or NULL; those stay uncached.
  uncached_attributes = Attribute.where(word_count_cache: nil)
                                 .where.not(value: ["", " ", ".", nil])

  uncached_attributes.find_each do |attribute|
    # update_column writes the single column directly on this record.
    attribute.update_column(:word_count_cache, counter.count(attribute.value))
  end
end
|
|
|
|
desc "Backfill cached word counts for the 500 most common uncached attribute values"
task most_used_attribute_word_counts: :environment do
  # One analyzer serves every value; the options never vary between them.
  counter = WordCountAnalyzer::Counter.new(
    ellipsis: 'no_special_treatment',
    hyperlink: 'count_as_one',
    contraction: 'count_as_one',
    hyphenated_word: 'count_as_one',
    date: 'no_special_treatment',
    number: 'count',
    numbered_list: 'ignore',
    xhtml: 'remove',
    forward_slash: 'count_as_multiple_except_dates',
    backslash: 'count_as_one',
    dotted_line: 'ignore',
    dashed_line: 'ignore',
    underscore: 'ignore',
    stray_punctuation: 'ignore'
  )

  # { value => number of uncached attributes holding that value }, limited to
  # the 500 most frequent values. The ORDER BY relies on ActiveRecord naming
  # the COUNT(id) aggregate "count_id".
  most_common_values = Attribute.where(word_count_cache: nil)
                                .group(:value)
                                .order('count_id DESC')
                                .limit(500)
                                .count(:id)

  # word count => every value that contains exactly that many words
  values_by_word_count = Hash.new { |buckets, words| buckets[words] = [] }

  most_common_values.each do |value, count|
    # Attributes whose value is NULL are grouped under a nil key. There is no
    # text to analyze in that case, so record zero words rather than passing
    # nil to the analyzer.
    word_count = value.nil? ? 0 : counter.count(value)

    values_by_word_count[word_count] << value
    puts "#{value} x #{count}: #{word_count} words"
  end

  # Values sharing a word count are written together: one UPDATE per distinct
  # count instead of one per attribute.
  values_by_word_count.each do |word_count, values|
    Attribute.where(word_count_cache: nil, value: values).update_all(word_count_cache: word_count)
  end
end
|
|
|
|
desc "Backfill cached word counts on all documents"
task document_word_count_caches: :environment do
  # Documents that still lack a cached count and have a non-empty body.
  # `with_deleted` presumably widens the scope to soft-deleted documents too
  # (TODO confirm against the Document model).
  uncached_documents = Document.with_deleted
                               .where(cached_word_count: nil)
                               .where.not(body: [nil, ""])

  uncached_documents.find_each(batch_size: 500) do |doc|
    doc.update_column(:cached_word_count, doc.computed_word_count)

    # Print each processed id so progress is visible while the task runs.
    puts doc.id
  end
end
|
|
|
|
desc "Start working through old categories/fields without position set"
task sortables_positions: :environment do
  # Prints how many categories and fields still have no position.
  report_backlog = lambda do
    puts "Empty position backlog:\n\t* #{AttributeCategory.where(position: nil).count} categories\n\t* #{AttributeField.where(position: nil).count} fields"
  end

  # Work on a random sample of up to 500 unpositioned categories per run.
  sampled_categories = AttributeCategory.where(position: nil).order("RANDOM()").limit(500).to_a

  report_backlog.call

  # Walk the sample from the last element to the first.
  sampled_categories.reverse_each do |category|
    # Backfill all the positioning for this category's page's categories
    category.backfill_categories_ordering!

    # On roughly one iteration in five, print the remaining backlog.
    # NOTE(review): nothing here re-reads the sampled categories, so when
    # several workers run at once the same page can presumably still be
    # backfilled more than once; the periodic report only shows progress.
    report_backlog.call if rand(100) < 20
  end

  puts "Done!"
end
|
|
end
|
|
|