From cc0a55cf9aead00e1cb044649f84c2187e0e4a35 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Sun, 18 Aug 2019 03:45:51 +0200 Subject: [PATCH] Add more accurate hashtag search (#11579) * Add more accurate hashtag search Using ElasticSearch to index hashtags with edge n-grams and score them by usage within the last 7 days since last activity. Only hashtags that have been reviewed and are listable can appear in searches, unless they match the query exactly * Fix search analyzer dropping non-ascii characters --- app/chewy/tags_index.rb | 37 +++++++++ app/models/tag.rb | 14 +++- app/models/trending_tags.rb | 3 + app/services/account_search_service.rb | 2 +- app/services/search_service.rb | 8 +- app/services/tag_search_service.rb | 82 +++++++++++++++++++ config/locales/simple_form.en.yml | 2 +- ...190815225426_add_last_status_at_to_tags.rb | 6 ++ db/schema.rb | 4 +- spec/models/tag_spec.rb | 4 +- 10 files changed, 149 insertions(+), 13 deletions(-) create mode 100644 app/chewy/tags_index.rb create mode 100644 app/services/tag_search_service.rb create mode 100644 db/migrate/20190815225426_add_last_status_at_to_tags.rb diff --git a/app/chewy/tags_index.rb b/app/chewy/tags_index.rb new file mode 100644 index 00000000000..300fc128f63 --- /dev/null +++ b/app/chewy/tags_index.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +class TagsIndex < Chewy::Index + settings index: { refresh_interval: '15m' }, analysis: { + analyzer: { + content: { + tokenizer: 'keyword', + filter: %w(lowercase asciifolding cjk_width), + }, + + edge_ngram: { + tokenizer: 'edge_ngram', + filter: %w(lowercase asciifolding cjk_width), + }, + }, + + tokenizer: { + edge_ngram: { + type: 'edge_ngram', + min_gram: 2, + max_gram: 15, + }, + }, + } + + define_type ::Tag.listable, delete_if: ->(tag) { tag.destroyed? || !tag.listable? } do + root date_detection: false do + field :name, type: 'text', analyzer: 'content' do + field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' + end + + field :reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? } + field :usage, type: 'long', value: ->(tag) { tag.history.reduce(0) { |total, day| total + day[:accounts].to_i } } + field :last_status_at, type: 'date', value: ->(tag) { tag.last_status_at || tag.created_at } + end + end +end diff --git a/app/models/tag.rb b/app/models/tag.rb index 1364d1dba68..5094d973db0 100644 --- a/app/models/tag.rb +++ b/app/models/tag.rb @@ -13,6 +13,8 @@ # listable :boolean # reviewed_at :datetime # requested_review_at :datetime +# last_status_at :datetime +# last_trend_at :datetime # class Tag < ApplicationRecord @@ -33,7 +35,8 @@ class Tag < ApplicationRecord scope :unreviewed, -> { where(reviewed_at: nil) } scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) } scope :usable, -> { where(usable: [true, nil]) } - scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) } + scope :listable, -> { where(listable: [true, nil]) } + scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) } scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) } delegate :accounts_count, @@ -44,6 +47,8 @@ class Tag < ApplicationRecord after_save :save_account_tag_stat + update_index('tags#tag', :self) if Chewy.enabled? + def account_tag_stat super || build_account_tag_stat end @@ -121,9 +126,10 @@ class Tag < ApplicationRecord normalized_term = normalize(term.strip).mb_chars.downcase.to_s pattern = sanitize_sql_like(normalized_term) + '%' - Tag.where(arel_table[:name].lower.matches(pattern)) - .where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term))) - .order(Arel.sql('length(name) ASC, score DESC, name ASC')) + Tag.listable + .where(arel_table[:name].lower.matches(pattern)) + .where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil))) + .order(Arel.sql('length(name) ASC, name ASC')) .limit(limit) .offset(offset) end diff --git a/app/models/trending_tags.rb b/app/models/trending_tags.rb index 3d60a7fea36..e4ce988c183 100644 --- a/app/models/trending_tags.rb +++ b/app/models/trending_tags.rb @@ -17,6 +17,9 @@ class TrendingTags increment_historical_use!(tag.id, at_time) increment_unique_use!(tag.id, account.id, at_time) increment_vote!(tag, at_time) + + tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago + tag.update(last_trend_at: Time.now.utc) if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago) end def get(limit, filtered: true) diff --git a/app/services/account_search_service.rb b/app/services/account_search_service.rb index d7bccdfe031..01caaefa94f 100644 --- a/app/services/account_search_service.rb +++ b/app/services/account_search_service.rb @@ -109,7 +109,7 @@ class AccountSearchService < BaseService field_value_factor: { field: 'followers_count', modifier: 'log2p', - missing: 1, + missing: 0, }, } end diff --git a/app/services/search_service.rb b/app/services/search_service.rb index 786d34b152f..fe601bbf4b9 100644 --- a/app/services/search_service.rb +++ b/app/services/search_service.rb @@ -57,10 +57,10 @@ class SearchService < BaseService end def perform_hashtags_search! - Tag.search_for( - @query.gsub(/\A#/, ''), - @limit, - @offset + TagSearchService.new.call( + @query, + limit: @limit, + offset: @offset ) end diff --git a/app/services/tag_search_service.rb b/app/services/tag_search_service.rb new file mode 100644 index 00000000000..64dd76bb778 --- /dev/null +++ b/app/services/tag_search_service.rb @@ -0,0 +1,82 @@ +# frozen_string_literal: true + +class TagSearchService < BaseService + def call(query, options = {}) + @query = query.strip.gsub(/\A#/, '') + @offset = options[:offset].to_i + @limit = options[:limit].to_i + + if Chewy.enabled? + from_elasticsearch + else + from_database + end + end + + private + + def from_elasticsearch + query = { + function_score: { + query: { + multi_match: { + query: @query, + fields: %w(name.edge_ngram name), + type: 'most_fields', + operator: 'and', + }, + }, + + functions: [ + { + field_value_factor: { + field: 'usage', + modifier: 'log2p', + missing: 0, + }, + }, + + { + gauss: { + last_status_at: { + scale: '7d', + offset: '14d', + decay: 0.5, + }, + }, + }, + ], + + boost_mode: 'multiply', + }, + } + + filter = { + bool: { + should: [ + { + term: { + reviewed: { + value: true, + }, + }, + }, + + { + term: { + name: { + value: @query, + }, + }, + }, + ], + }, + } + + TagsIndex.query(query).filter(filter).limit(@limit).offset(@offset).objects.compact + end + + def from_database + Tag.search_for(@query, @limit, @offset) + end +end diff --git a/config/locales/simple_form.en.yml b/config/locales/simple_form.en.yml index e15d5904fd7..98f0843d03b 100644 --- a/config/locales/simple_form.en.yml +++ b/config/locales/simple_form.en.yml @@ -142,7 +142,7 @@ en: report: Send e-mail when a new report is submitted trending_tag: Send e-mail when an unreviewed hashtag is trending tag: - listable: Allow this hashtag to appear on the profile directory + listable: Allow this hashtag to appear in searches and on the profile directory trendable: Allow this hashtag to appear under trends usable: Allow toots to use this hashtag 'no': 'No' diff --git a/db/migrate/20190815225426_add_last_status_at_to_tags.rb b/db/migrate/20190815225426_add_last_status_at_to_tags.rb new file mode 100644 index 00000000000..d83537c47f9 --- /dev/null +++ b/db/migrate/20190815225426_add_last_status_at_to_tags.rb @@ -0,0 +1,6 @@ +class AddLastStatusAtToTags < ActiveRecord::Migration[5.2] + def change + add_column :tags, :last_status_at, :datetime + add_column :tags, :last_trend_at, :datetime + end +end diff --git a/db/schema.rb b/db/schema.rb index f8fc6a821db..0da20d62f53 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2019_08_07_135426) do +ActiveRecord::Schema.define(version: 2019_08_15_225426) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -667,6 +667,8 @@ ActiveRecord::Schema.define(version: 2019_08_07_135426) do t.boolean "listable" t.datetime "reviewed_at" t.datetime "requested_review_at" + t.datetime "last_status_at" + t.datetime "last_trend_at" t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true end diff --git a/spec/models/tag_spec.rb b/spec/models/tag_spec.rb index 9d700849bb8..2bb30fb57f7 100644 --- a/spec/models/tag_spec.rb +++ b/spec/models/tag_spec.rb @@ -136,8 +136,8 @@ RSpec.describe Tag, type: :model do end it 'finds the exact matching tag as the first item' do - similar_tag = Fabricate(:tag, name: "matchlater", score: 1) - tag = Fabricate(:tag, name: "match", score: 1) + similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc) + tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc) results = Tag.search_for("match")