From 619fad6cf8078ea997554081febe850404bee73c Mon Sep 17 00:00:00 2001
From: Eugen Rochko
Date: Sun, 11 Apr 2021 11:22:50 +0200
Subject: [PATCH] Remove spam check and dependency on nilsimsa gem (#16011)

---
 Gemfile                                       |   1 -
 Gemfile.lock                                  |   8 -
 app/controllers/admin/dashboard_controller.rb |   1 -
 app/lib/activitypub/activity/create.rb        |   5 -
 app/lib/spam_check.rb                         | 198 ------------------
 app/models/form/admin_settings.rb             |   2 -
 app/services/process_mentions_service.rb      |   5 -
 app/services/remove_status_service.rb         |   5 -
 app/views/admin/dashboard/index.html.haml     |   2 -
 app/views/admin/settings/edit.html.haml       |   3 -
 config/locales/en.yml                         |   6 -
 config/settings.yml                           |   1 -
 spec/lib/spam_check_spec.rb                   | 192 -----------------
 13 files changed, 429 deletions(-)
 delete mode 100644 app/lib/spam_check.rb
 delete mode 100644 spec/lib/spam_check_spec.rb

diff --git a/Gemfile b/Gemfile
index cb24207ca0d..1190f2558fe 100644
--- a/Gemfile
+++ b/Gemfile
@@ -62,7 +62,6 @@ gem 'idn-ruby', require: 'idn'
 gem 'kaminari', '~> 1.2'
 gem 'link_header', '~> 0.0'
 gem 'mime-types', '~> 3.3.1', require: 'mime/types/columnar'
-gem 'nilsimsa', git: 'https://github.com/witgo/nilsimsa', ref: 'fd184883048b922b176939f851338d0a4971a532'
 gem 'nokogiri', '~> 1.11'
 gem 'nsa', git: 'https://github.com/Gargron/nsa', ref: 'd1079e0cdafdfed7f9f35478d13b9bdaa65965c0'
 gem 'oj', '~> 3.11'
diff --git a/Gemfile.lock b/Gemfile.lock
index 11765a96714..8bea313320e 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -18,13 +18,6 @@ GIT
     activerecord (>= 6.1.0)
     activesupport (>= 6.1.0)
 
-GIT
-  remote: https://github.com/witgo/nilsimsa
-  revision: fd184883048b922b176939f851338d0a4971a532
-  ref: fd184883048b922b176939f851338d0a4971a532
-  specs:
-    nilsimsa (1.1.2)
-
 GEM
   remote: https://rubygems.org/
   specs:
@@ -762,7 +755,6 @@ DEPENDENCIES
   microformats (~> 4.2)
   mime-types (~> 3.3.1)
   net-ldap (~> 0.17)
-  nilsimsa!
   nokogiri (~> 1.11)
   nsa!
   oj (~> 3.11)
diff --git a/app/controllers/admin/dashboard_controller.rb b/app/controllers/admin/dashboard_controller.rb
index 4422825eec8..c829ed98f87 100644
--- a/app/controllers/admin/dashboard_controller.rb
+++ b/app/controllers/admin/dashboard_controller.rb
@@ -35,7 +35,6 @@ module Admin
       @whitelist_enabled = whitelist_mode?
       @profile_directory = Setting.profile_directory
       @timeline_preview = Setting.timeline_preview
-      @spam_check_enabled = Setting.spam_check_enabled
       @trends_enabled = Setting.trends
     end
 
diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
index 0fa306cddd0..9f6dd9ce027 100644
--- a/app/lib/activitypub/activity/create.rb
+++ b/app/lib/activitypub/activity/create.rb
@@ -88,7 +88,6 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
 
     resolve_thread(@status)
     fetch_replies(@status)
-    check_for_spam
     distribute(@status)
     forward_for_reply
   end
@@ -492,10 +491,6 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
     Tombstone.exists?(uri: object_uri)
   end
 
-  def check_for_spam
-    SpamCheck.perform(@status)
-  end
-
   def forward_for_reply
     return unless @status.distributable? && @json['signature'].present? && reply_to_local?
 
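For context on what is being removed: the nilsimsa gem pulled from the Gemfile above implements a locality-sensitive hash, so near-identical texts produce near-identical digests. A minimal sketch of how that differs from a cryptographic hash, using the same Nilsimsa.new(...).hexdigest call that the deleted app/lib/spam_check.rb below relies on (the sample strings are illustrative):

    require 'digest'
    require 'nilsimsa' # the git-sourced gem removed from the Gemfile above

    # Two messages that differ only in a trailing number: their Nilsimsa
    # digests differ in only a few bits, while their MD5 digests are
    # completely unrelated.
    a = Nilsimsa.new('Hello Alice, check out this offer 1234').hexdigest
    b = Nilsimsa.new('Hello Alice, check out this offer 9568').hexdigest
    puts a, b
    puts Digest::MD5.hexdigest('Hello Alice, check out this offer 1234')
    puts Digest::MD5.hexdigest('Hello Alice, check out this offer 9568')

That property is what the spam check exploited: spam that rotates random tokens between copies still hashes close enough to be matched.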
diff --git a/app/lib/spam_check.rb b/app/lib/spam_check.rb
deleted file mode 100644
index dcb2db9ca92..00000000000
--- a/app/lib/spam_check.rb
+++ /dev/null
@@ -1,198 +0,0 @@
-# frozen_string_literal: true
-
-class SpamCheck
-  include Redisable
-  include ActionView::Helpers::TextHelper
-
-  # Threshold over which two Nilsimsa values are considered
-  # to refer to the same text
-  NILSIMSA_COMPARE_THRESHOLD = 95
-
-  # Nilsimsa doesn't work well on small inputs, so below
-  # this size, we check only for exact matches with MD5
-  NILSIMSA_MIN_SIZE = 10
-
-  # How long to keep the trail of digests between updates,
-  # there is no reason to store it forever
-  EXPIRE_SET_AFTER = 1.week.seconds
-
-  # How many digests to keep in an account's trail. If it's
-  # too small, spam could rotate around different message templates
-  MAX_TRAIL_SIZE = 10
-
-  # How many detected duplicates to allow through before
-  # considering the message as spam
-  THRESHOLD = 5
-
-  def initialize(status)
-    @account = status.account
-    @status  = status
-  end
-
-  def skip?
-    disabled? || already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
-  end
-
-  def spam?
-    if insufficient_data?
-      false
-    elsif nilsimsa?
-      digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
-    else
-      digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
-    end
-  end
-
-  def flag!
-    auto_report_status!
-  end
-
-  def remember!
-    # The scores in sorted sets don't actually have enough bits to hold an exact
-    # value of our snowflake IDs, so we use it only for its ordering property. To
-    # get the correct status ID back, we have to save it in the string value
-
-    redis.zadd(redis_key, @status.id, digest_with_algorithm)
-    redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
-    redis.expire(redis_key, EXPIRE_SET_AFTER)
-  end
-
-  def reset!
-    redis.del(redis_key)
-  end
-
-  def hashable_text
-    return @hashable_text if defined?(@hashable_text)
-
-    @hashable_text = @status.text
-    @hashable_text = remove_mentions(@hashable_text)
-    @hashable_text = strip_tags(@hashable_text) unless @status.local?
-    @hashable_text = normalize_unicode(@status.spoiler_text + ' ' + @hashable_text)
-    @hashable_text = remove_whitespace(@hashable_text)
-  end
-
-  def insufficient_data?
-    hashable_text.blank?
-  end
-
-  def digest
-    @digest ||= begin
-      if nilsimsa?
-        Nilsimsa.new(hashable_text).hexdigest
-      else
-        Digest::MD5.hexdigest(hashable_text)
-      end
-    end
-  end
-
-  def digest_with_algorithm
-    if nilsimsa?
-      ['nilsimsa', digest, @status.id].join(':')
-    else
-      ['md5', digest, @status.id].join(':')
-    end
-  end
-
-  class << self
-    def perform(status)
-      spam_check = new(status)
-
-      return if spam_check.skip?
-
-      if spam_check.spam?
-        spam_check.flag!
-      else
-        spam_check.remember!
-      end
-    end
-  end
-
-  private
-
-  def disabled?
-    !Setting.spam_check_enabled
-  end
-
-  def remove_mentions(text)
-    return text.gsub(Account::MENTION_RE, '') if @status.local?
-
-    Nokogiri::HTML.fragment(text).tap do |html|
-      mentions = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }
-
-      html.traverse do |element|
-        element.unlink if element.name == 'a' && mentions.include?(element['href'])
-      end
-    end.to_s
-  end
-
-  def normalize_unicode(text)
-    text.unicode_normalize(:nfkc).downcase
-  end
-
-  def remove_whitespace(text)
-    text.gsub(/\s+/, ' ').strip
-  end
-
-  def auto_report_status!
-    status_ids = Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id] if @status.distributable?
-    ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected'))
-  end
-
-  def already_flagged?
-    @account.silenced? || @account.targeted_reports.unresolved.where(account_id: -99).exists?
-  end
-
-  def trusted?
-    @account.trust_level > Account::TRUST_LEVELS[:untrusted] || (@account.local? && @account.user_staff?)
-  end
-
-  def no_unsolicited_mentions?
-    @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) }
-  end
-
-  def solicited_reply?
-    !@status.thread.nil? && @status.thread.mentions.where(account: @account).exists?
-  end
-
-  def nilsimsa_compare_value(first, second)
-    first  = [first].pack('H*')
-    second = [second].pack('H*')
-
-    bits = 0
-
-    0.upto(31) do |i|
-      bits += Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord
-    end
-
-    128 - bits # -128 <= Nilsimsa Compare Value <= 128
-  end
-
-  def nilsimsa?
-    hashable_text.size > NILSIMSA_MIN_SIZE
-  end
-
-  def other_digests
-    redis.zrange(redis_key, 0, -1)
-  end
-
-  def digests_over_threshold?(filter_algorithm)
-    other_digests.select do |record|
-      algorithm, other_digest, status_id = record.split(':')
-
-      next unless algorithm == filter_algorithm
-
-      yield algorithm, other_digest, status_id
-    end.size >= THRESHOLD
-  end
-
-  def matching_status_ids
-    if nilsimsa?
-      other_digests.filter_map { |record| record.split(':')[2] if record.start_with?('nilsimsa') && nilsimsa_compare_value(digest, record.split(':')[1]) >= NILSIMSA_COMPARE_THRESHOLD }
-    else
-      other_digests.filter_map { |record| record.split(':')[2] if record.start_with?('md5') && record.split(':')[1] == digest }
-    end
-  end
-
-  def redis_key
-    @redis_key ||= "spam_check:#{@account.id}"
-  end
-end
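The core similarity test in the class above is nilsimsa_compare_value: it unpacks two 64-hex-character digests into 32 bytes each, counts the differing bits via the gem's POPC popcount table, and maps the result onto a -128..128 scale, where 128 means identical. A pure-Ruby equivalent without the gem's lookup table, for illustration:

    # Equivalent of the deleted nilsimsa_compare_value, using Ruby's own
    # popcount instead of Nilsimsa::POPC. Digests are 64 hex chars (32 bytes).
    def compare_value(first, second)
      a = [first].pack('H*').bytes
      b = [second].pack('H*').bytes
      differing_bits = a.zip(b).sum { |x, y| (x ^ y).to_s(2).count('1') }
      128 - differing_bits
    end

    compare_value('ab' * 32, 'ab' * 32) # => 128, i.e. identical digests

Two digests counted as the same text once this value reached NILSIMSA_COMPARE_THRESHOLD (95), that is, at most 33 differing bits out of 256.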
diff --git a/app/models/form/admin_settings.rb b/app/models/form/admin_settings.rb
index e9f78da2123..b5c3dcdbe89 100644
--- a/app/models/form/admin_settings.rb
+++ b/app/models/form/admin_settings.rb
@@ -29,7 +29,6 @@ class Form::AdminSettings
     thumbnail
     hero
     mascot
-    spam_check_enabled
     trends
     trendable_by_default
     show_domain_blocks
@@ -48,7 +47,6 @@ class Form::AdminSettings
     show_known_fediverse_at_about_page
     preview_sensitive_media
     profile_directory
-    spam_check_enabled
     trends
     trendable_by_default
     noindex
diff --git a/app/services/process_mentions_service.rb b/app/services/process_mentions_service.rb
index 4c02c786527..73dbb18345a 100644
--- a/app/services/process_mentions_service.rb
+++ b/app/services/process_mentions_service.rb
@@ -43,7 +43,6 @@ class ProcessMentionsService < BaseService
     end
 
     status.save!
-    check_for_spam(status)
 
     mentions.each { |mention| create_notification(mention) }
   end
@@ -72,8 +71,4 @@ class ProcessMentionsService < BaseService
   def resolve_account_service
     ResolveAccountService.new
   end
-
-  def check_for_spam(status)
-    SpamCheck.perform(status)
-  end
 end
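Both hooks removed here and in app/lib/activitypub/activity/create.rb funnelled into SpamCheck.perform, and a status judged not to be spam ended in remember!, which maintained a capped, expiring "trail" of digests per account in a Redis sorted set. A standalone sketch of that storage pattern using the redis gem directly (the key and member values are illustrative):

    require 'redis'

    redis     = Redis.new
    key       = 'spam_check:1234' # per-account key, as in the deleted redis_key
    max_trail = 10                # MAX_TRAIL_SIZE above

    # The member string carries algorithm:digest:status_id because a sorted-set
    # score is a float and cannot represent a full snowflake ID exactly;
    # the score is used only for ordering.
    redis.zadd(key, 108_142, 'md5:68b329da9893e34099c7d8ad5cb9c940:108142')
    redis.zremrangebyrank(key, 0, -(max_trail + 1)) # keep only the newest ten
    redis.expire(key, 7 * 24 * 3600)                # EXPIRE_SET_AFTER, one week

The next file shows the other half of that lifecycle: when a status was deleted, its entry was pruned from the trail by score with ZREMRANGEBYSCORE.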
diff --git a/app/services/remove_status_service.rb b/app/services/remove_status_service.rb
index d6043fb5d80..d642beeaae5 100644
--- a/app/services/remove_status_service.rb
+++ b/app/services/remove_status_service.rb
@@ -41,7 +41,6 @@ class RemoveStatusService < BaseService
         remove_from_hashtags
         remove_from_public
         remove_from_media if @status.media_attachments.any?
-        remove_from_spam_check
         remove_media
       end
 
@@ -163,10 +162,6 @@ class RemoveStatusService < BaseService
     @status.media_attachments.destroy_all
   end
 
-  def remove_from_spam_check
-    redis.zremrangebyscore("spam_check:#{@status.account_id}", @status.id, @status.id)
-  end
-
   def lock_options
     { redis: Redis.current, key: "distribute:#{@status.id}" }
   end
diff --git a/app/views/admin/dashboard/index.html.haml b/app/views/admin/dashboard/index.html.haml
index 2055384029e..57a753e6b6d 100644
--- a/app/views/admin/dashboard/index.html.haml
+++ b/app/views/admin/dashboard/index.html.haml
@@ -77,8 +77,6 @@
             = feature_hint(link_to(t('admin.dashboard.trends'), edit_admin_settings_path), @trends_enabled)
           %li
             = feature_hint(link_to(t('admin.dashboard.feature_relay'), admin_relays_path), @relay_enabled)
-          %li
-            = feature_hint(link_to(t('admin.dashboard.feature_spam_check'), edit_admin_settings_path), @spam_check_enabled)
 
   .dashboard__widgets__versions
     %div
diff --git a/app/views/admin/settings/edit.html.haml b/app/views/admin/settings/edit.html.haml
index 159bd4b0a0d..0e705f20549 100644
--- a/app/views/admin/settings/edit.html.haml
+++ b/app/views/admin/settings/edit.html.haml
@@ -92,9 +92,6 @@
   .fields-group
     = f.input :noindex, as: :boolean, wrapper: :with_label, label: t('admin.settings.default_noindex.title'), hint: t('admin.settings.default_noindex.desc_html')
 
-  .fields-group
-    = f.input :spam_check_enabled, as: :boolean, wrapper: :with_label, label: t('admin.settings.spam_check_enabled.title'), hint: t('admin.settings.spam_check_enabled.desc_html')
-
   %hr.spacer/
 
   .fields-group
diff --git a/config/locales/en.yml b/config/locales/en.yml
index 182a8e98507..3387b4df6a5 100644
--- a/config/locales/en.yml
+++ b/config/locales/en.yml
@@ -363,7 +363,6 @@ en:
       feature_profile_directory: Profile directory
       feature_registrations: Registrations
      feature_relay: Federation relay
-      feature_spam_check: Anti-spam
       feature_timeline_preview: Timeline preview
       features: Features
       hidden_service: Federation with hidden services
@@ -627,9 +626,6 @@ en:
        desc_html: You can write your own privacy policy, terms of service or other legalese. You can use HTML tags
        title: Custom terms of service
      site_title: Server name
-      spam_check_enabled:
-        desc_html: Mastodon can auto-report accounts that send repeated unsolicited messages. There may be false positives.
-        title: Anti-spam automation
      thumbnail:
        desc_html: Used for previews via OpenGraph and API. 1200x630px recommended
        title: Server thumbnail
@@ -1209,8 +1205,6 @@ en:
    relationships: Follows and followers
    two_factor_authentication: Two-factor Auth
    webauthn_authentication: Security keys
-  spam_check:
-    spam_detected: This is an automated report. Spam has been detected.
  statuses:
    attached:
      audio:
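The locale keys above were the last user-visible traces of the feature. One detail worth noting from the deleted class is how aggressively it normalized text before hashing: mentions stripped, HTML removed for remote statuses, spoiler text prepended, then NFKC-folded, downcased, and whitespace-collapsed. The final steps can be reproduced standalone:

    # Standalone version of SpamCheck's normalize_unicode + remove_whitespace:
    # NFKC folding also maps fullwidth letters and non-breaking spaces to
    # their plain ASCII equivalents before hashing.
    def normalize(text)
      text.unicode_normalize(:nfkc).downcase.gsub(/\s+/, ' ').strip
    end

    normalize("Ｈｅｌｌｏ\u00A0 Ｗｏｒｌｄ") # => "hello world"

This is why cosmetic variation such as case, exotic whitespace, or fullwidth characters did not let spam evade digest matching.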
diff --git a/config/settings.yml b/config/settings.yml
index 9cf68a09663..b79ea620c80 100644
--- a/config/settings.yml
+++ b/config/settings.yml
@@ -67,7 +67,6 @@ defaults: &defaults
   activity_api_enabled: true
   peers_api_enabled: true
   show_known_fediverse_at_about_page: true
-  spam_check_enabled: true
   show_domain_blocks: 'disabled'
   show_domain_blocks_rationale: 'disabled'
   require_invite_text: false
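With spam_check_enabled gone from the defaults above, nothing reads the flag any more. The spec file deleted below documents the behaviour being dropped; one case it covers is the fallback for very short texts, where Nilsimsa is unreliable and the check switched to exact MD5 matching. A condensed sketch of that algorithm choice (it assumes the nilsimsa gem from the removed Gemfile entry and omits the status ID the real member string also carried):

    require 'digest'
    require 'nilsimsa'

    NILSIMSA_MIN_SIZE = 10 # constant from the deleted app/lib/spam_check.rb

    # Condensed from SpamCheck#digest / #digest_with_algorithm: fuzzy hash
    # for real sentences, exact MD5 for inputs too short to hash fuzzily.
    def digest_with_algorithm(text)
      if text.size > NILSIMSA_MIN_SIZE
        ['nilsimsa', Nilsimsa.new(text).hexdigest].join(':')
      else
        ['md5', Digest::MD5.hexdigest(text)].join(':')
      end
    end

    digest_with_algorithm('🙄')                 # => "md5:..."
    digest_with_algorithm('a much longer text') # => "nilsimsa:..."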
diff --git a/spec/lib/spam_check_spec.rb b/spec/lib/spam_check_spec.rb
deleted file mode 100644
index 159d8325735..00000000000
--- a/spec/lib/spam_check_spec.rb
+++ /dev/null
@@ -1,192 +0,0 @@
-# frozen_string_literal: true
-
-require 'rails_helper'
-
-RSpec.describe SpamCheck do
-  let!(:sender) { Fabricate(:account) }
-  let!(:alice) { Fabricate(:account, username: 'alice') }
-  let!(:bob) { Fabricate(:account, username: 'bob') }
-
-  def status_with_html(text, options = {})
-    status = PostStatusService.new.call(sender, { text: text }.merge(options))
-    status.update_columns(text: Formatter.instance.format(status), local: false)
-    status
-  end
-
-  describe '#hashable_text' do
-    it 'removes mentions from HTML for remote statuses' do
-      status = status_with_html('@alice Hello')
-      expect(described_class.new(status).hashable_text).to eq 'hello'
-    end
-
-    it 'removes mentions from text for local statuses' do
-      status = PostStatusService.new.call(alice, text: "Hey @#{sender.username}, how are you?")
-      expect(described_class.new(status).hashable_text).to eq 'hey , how are you?'
-    end
-  end
-
-  describe '#insufficient_data?' do
-    it 'returns true when there is no text' do
-      status = status_with_html('@alice')
-      expect(described_class.new(status).insufficient_data?).to be true
-    end
-
-    it 'returns false when there is text' do
-      status = status_with_html('@alice h')
-      expect(described_class.new(status).insufficient_data?).to be false
-    end
-  end
-
-  describe '#digest' do
-    it 'returns a string' do
-      status = status_with_html('@alice Hello world')
-      expect(described_class.new(status).digest).to be_a String
-    end
-  end
-
-  describe '#spam?' do
-    it 'returns false for a unique status' do
-      status = status_with_html('@alice Hello')
-      expect(described_class.new(status).spam?).to be false
-    end
-
-    it 'returns false for different statuses to the same recipient' do
-      status1 = status_with_html('@alice Hello')
-      described_class.new(status1).remember!
-      status2 = status_with_html('@alice Are you available to talk?')
-      expect(described_class.new(status2).spam?).to be false
-    end
-
-    it 'returns false for statuses with different content warnings' do
-      status1 = status_with_html('@alice Are you available to talk?')
-      described_class.new(status1).remember!
-      status2 = status_with_html('@alice Are you available to talk?', spoiler_text: 'This is a completely different matter than what I was talking about previously, I swear!')
-      expect(described_class.new(status2).spam?).to be false
-    end
-
-    it 'returns false for different statuses to different recipients' do
-      status1 = status_with_html('@alice How is it going?')
-      described_class.new(status1).remember!
-      status2 = status_with_html('@bob Are you okay?')
-      expect(described_class.new(status2).spam?).to be false
-    end
-
-    it 'returns false for very short different statuses to different recipients' do
-      status1 = status_with_html('@alice 🙄')
-      described_class.new(status1).remember!
-      status2 = status_with_html('@bob Huh?')
-      expect(described_class.new(status2).spam?).to be false
-    end
-
-    it 'returns false for statuses with no text' do
-      status1 = status_with_html('@alice')
-      described_class.new(status1).remember!
-      status2 = status_with_html('@bob')
-      expect(described_class.new(status2).spam?).to be false
-    end
-
-    it 'returns true for duplicate statuses to the same recipient' do
-      described_class::THRESHOLD.times do
-        status1 = status_with_html('@alice Hello')
-        described_class.new(status1).remember!
-      end
-
-      status2 = status_with_html('@alice Hello')
-      expect(described_class.new(status2).spam?).to be true
-    end
-
-    it 'returns true for duplicate statuses to different recipients' do
-      described_class::THRESHOLD.times do
-        status1 = status_with_html('@alice Hello')
-        described_class.new(status1).remember!
-      end
-
-      status2 = status_with_html('@bob Hello')
-      expect(described_class.new(status2).spam?).to be true
-    end
-
-    it 'returns true for nearly identical statuses with random numbers' do
-      source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
-
-      described_class::THRESHOLD.times do
-        status1 = status_with_html('@alice ' + source_text + ' 1234')
-        described_class.new(status1).remember!
-      end
-
-      status2 = status_with_html('@bob ' + source_text + ' 9568')
-      expect(described_class.new(status2).spam?).to be true
-    end
-  end
-
-  describe '#skip?' do
-    it 'returns true when the sender is already silenced' do
-      status = status_with_html('@alice Hello')
-      sender.silence!
-      expect(described_class.new(status).skip?).to be true
-    end
-
-    it 'returns true when the mentioned person follows the sender' do
-      status = status_with_html('@alice Hello')
-      alice.follow!(sender)
-      expect(described_class.new(status).skip?).to be true
-    end
-
-    it 'returns false when even one mentioned person doesn\'t follow the sender' do
-      status = status_with_html('@alice @bob Hello')
-      alice.follow!(sender)
-      expect(described_class.new(status).skip?).to be false
-    end
-
-    it 'returns true when the sender is replying to a status that mentions the sender' do
-      parent = PostStatusService.new.call(alice, text: "Hey @#{sender.username}, how are you?")
-      status = status_with_html('@alice @bob Hello', thread: parent)
-      expect(described_class.new(status).skip?).to be true
-    end
-  end
-
-  describe '#remember!' do
-    let(:status) { status_with_html('@alice') }
-    let(:spam_check) { described_class.new(status) }
-    let(:redis_key) { spam_check.send(:redis_key) }
-
-    it 'remembers' do
-      expect(Redis.current.exists?(redis_key)).to be true
-      spam_check.remember!
-      expect(Redis.current.exists?(redis_key)).to be true
-    end
-  end
-
-  describe '#reset!' do
-    let(:status) { status_with_html('@alice') }
-    let(:spam_check) { described_class.new(status) }
-    let(:redis_key) { spam_check.send(:redis_key) }
-
-    before do
-      spam_check.remember!
-    end
-
-    it 'resets' do
-      expect(Redis.current.exists?(redis_key)).to be true
-      spam_check.reset!
-      expect(Redis.current.exists?(redis_key)).to be false
-    end
-  end
-
-  describe '#flag!' do
-    let!(:status1) { status_with_html('@alice General Kenobi you are a bold one') }
-    let!(:status2) { status_with_html('@alice @bob General Kenobi, you are a bold one') }
-
-    before do
-      described_class.new(status1).remember!
-      described_class.new(status2).flag!
-    end
-
-    it 'creates a report about the account' do
-      expect(sender.targeted_reports.unresolved.count).to eq 1
-    end
-
-    it 'attaches both matching statuses to the report' do
-      expect(sender.targeted_reports.first.status_ids).to include(status1.id, status2.id)
-    end
-  end
-end
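Putting the deleted pieces together, the heuristic these specs covered can be simulated outside Rails in a few lines. This toy run reuses the compare_value sketch from earlier and assumes the removed nilsimsa gem; the 95 cutoff and the threshold of 5 are the NILSIMSA_COMPARE_THRESHOLD and THRESHOLD constants from the deleted class:

    require 'nilsimsa'

    THRESHOLD = 5 # duplicates tolerated before flagging, as in SpamCheck
    trail     = []

    # Remember five near-identical messages, then watch the sixth cross
    # the threshold of similar digests already in the trail.
    6.times do |i|
      digest  = Nilsimsa.new("Hello Alice, great offer! #{1000 + i}").hexdigest
      matches = trail.count { |d| compare_value(d, digest) >= 95 }
      puts "message #{i + 1}: #{matches} similar digests in trail" \
           "#{' -> would be flagged as spam' if matches >= THRESHOLD}"
      trail << digest
    end

With messages this similar, the sixth one should cross the threshold and, in the deleted code path, trigger flag! and the auto-report asserted in the final specs above.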