From 9db85085a9c0eacd2fc8edaa9cc9259b36da01e5 Mon Sep 17 00:00:00 2001 From: Matt Jankowski Date: Wed, 3 May 2017 10:59:31 -0400 Subject: [PATCH] Language improvements, replace whatlanguage with CLD (#2753) * add failing en specs * add cld2 gem * Replace WhatLanguage with CLD --- Gemfile | 2 +- Gemfile.lock | 6 ++++-- app/lib/language_detector.rb | 14 +++++++++++++- spec/lib/language_detector_spec.rb | 22 ++++++++++++++-------- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/Gemfile b/Gemfile index 1287afe441..d84597a787 100644 --- a/Gemfile +++ b/Gemfile @@ -20,6 +20,7 @@ gem 'paperclip', '~> 5.1' gem 'paperclip-av-transcoder' gem 'addressable' +gem 'cld2', require: 'cld' gem 'devise' gem 'devise-two-factor' gem 'doorkeeper' @@ -56,7 +57,6 @@ gem 'statsd-instrument' gem 'twitter-text' gem 'tzinfo-data' gem 'webpacker', '~>1.2' -gem 'whatlanguage' # For some reason the view specs start failing without this gem 'react-rails' diff --git a/Gemfile.lock b/Gemfile.lock index 218e17237c..f4b307cec9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -102,6 +102,8 @@ GEM rack-test (>= 0.5.4) xpath (~> 2.0) chunky_png (1.3.8) + cld2 (1.0.3) + ffi (~> 1.9.3) climate_control (0.1.0) cocaine (0.5.8) climate_control (>= 0.0.3, < 1.0) @@ -153,6 +155,7 @@ GEM faker (1.7.3) i18n (~> 0.5) fast_blank (1.0.0) + ffi (1.9.18) fuubar (2.2.0) rspec-core (~> 3.0) ruby-progressbar (~> 1.4) @@ -463,7 +466,6 @@ GEM websocket-driver (0.6.5) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.2) - whatlanguage (1.0.6) xpath (2.0.0) nokogiri (~> 1.3) @@ -484,6 +486,7 @@ DEPENDENCIES capistrano-rbenv capistrano-yarn capybara + cld2 devise devise-two-factor doorkeeper @@ -549,7 +552,6 @@ DEPENDENCIES uglifier (>= 1.3.0) webmock webpacker (~> 1.2) - whatlanguage RUBY VERSION ruby 2.4.1p111 diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb index 9a32d6a642..8c1751beb8 100644 --- a/app/lib/language_detector.rb +++ b/app/lib/language_detector.rb @@ -9,11 +9,23 @@ class LanguageDetector end def to_iso_s - WhatLanguage.new(:all).language_iso(text_without_urls) || default_locale.to_sym + detected_language_code || default_locale.to_sym end private + def detected_language_code + detected_language[:code].to_sym if detected_language_reliable? + end + + def detected_language + @_detected_language ||= CLD.detect_language(text_without_urls) + end + + def detected_language_reliable? + detected_language[:reliable] + end + def text_without_urls text.dup.tap do |new_text| URI.extract(new_text).each do |url| diff --git a/spec/lib/language_detector_spec.rb b/spec/lib/language_detector_spec.rb index 5fb19a1e7d..bd4e65ef8e 100644 --- a/spec/lib/language_detector_spec.rb +++ b/spec/lib/language_detector_spec.rb @@ -3,11 +3,17 @@ require 'rails_helper' describe LanguageDetector do describe 'to_iso_s' do - it 'detects english language' do - string = 'Hello and welcome to mastodon' - result = described_class.new(string).to_iso_s + it 'detects english language for basic strings' do + strings = [ + "Hello and welcome to mastodon", + "I'd rather not!", + "a lot of people just want to feel righteous all the time and that's all that matters", + ] + strings.each do |string| + result = described_class.new(string).to_iso_s - expect(result).to eq :en + expect(result).to eq(:en), string + end end it 'detects spanish language' do @@ -19,15 +25,15 @@ describe LanguageDetector do describe 'when language can\'t be detected' do it 'confirm language engine cant detect' do - result = WhatLanguage.new(:all).language_iso('') - expect(result).to be_nil + result = CLD.detect_language('') + expect(result[:reliable]).to be false end describe 'because of a URL' do it 'uses default locale when sent just a URL' do string = 'http://example.com/media/2kFTgOJLXhQf0g2nKB4' - wl_result = WhatLanguage.new(:all).language_iso(string) - expect(wl_result).not_to eq :en + cld_result = CLD.detect_language(string)[:code] + expect(cld_result).not_to eq :en result = described_class.new(string).to_iso_s