FIX: Better handling of syllables in rewind writing analysis (#36616)

martin-brennan · web-flow · commit 278e36c4d15b · 2025-12-11T13:34:12.000+10:00
Several users were ending up with negative values in their readability
scores for Rewind. The issue was with the syllable counting regex in the
old query:

```
regexp_count(lower(plain), '[aeiouy]+')                AS syllables
```

This regex counted consecutive vowel clusters, not syllables. This caused
massive overcounting:

- Technical posts with URLs/emails have lots of vowel sequences
- Markdown code blocks with variable names
- Words with a silent "e" such as "made" (1 syllable) count as 2 vowel clusters
- The pattern [aeiouy]+ matches any consecutive vowels as a single
  match, but doesn't account for proper syllable boundaries

For an example dataset of 20,181 words with an average score of -49.94,
the formula worked out to `-49.94 = 206.835 - 1.015*(words/sentences) -
84.6*(syllables/words)`

This means the syllables/words ratio was extremely high (likely 2.5-3.0+
instead of the realistic ~1.4).

To fix this, we instead use an approximation of syllable counting for
English, using an average of 1.45 syllables per word. This is a common
heuristic used in readability formulas when precise syllable counts are
not available.
diff --git a/plugins/discourse-rewind/app/services/discourse_rewind/action/writing_analysis.rb b/plugins/discourse-rewind/app/services/discourse_rewind/action/writing_analysis.rb
@@ -36,11 +36,15 @@ def call
           post_count.first > 0 ? (total_words.first.to_f / post_count.first).round(2) : 0
 
         # Calculated using the Flesch Reading Ease formula,
-        # with an approximation for syllables since this can
-        # be tricky to get right in SQL.
+        # with a statistical approximation for syllables (1.45 per word,
+        # which is the average for English text). This is more reliable
+        # than regex-based syllable counting which can be thrown off by
+        # URLs, code blocks, and technical terminology.
         #
-        # Tries to handle short sentences or ones without delmiters
+        # Tries to handle short sentences or ones without delimiters
         # and ending with emojis by treating them as a single sentence.
+        #
+        # Scores are bounded between 0-100 to prevent extreme negative values.
         readability_score =
           DB.query_single(<<~SQL, user_id: user.id, start: date.first, end: date.last)
           WITH cleaned AS (
@@ -62,7 +66,7 @@ def call
               plain,
               word_count AS words,
               regexp_count(plain, '[.!?;:](\s|$)')                   AS sentences_raw,
-              regexp_count(lower(plain), '[aeiouy]+')                AS syllables
+              (word_count * 1.45)                                    AS syllables
             FROM cleaned
           ),
           scores AS (
@@ -79,18 +83,18 @@ def call
                 ELSE sentences_raw
               END AS sentences_fixed,
 
-              -- Flesch Reading Ease formula
+              -- Flesch Reading Ease formula with bounds (0-100)
               CASE
                 WHEN words = 0 THEN NULL
                 WHEN (CASE WHEN sentences_raw = 0 AND words > 5 THEN 1 ELSE sentences_raw END) = 0 THEN NULL
-                ELSE (
+                ELSE GREATEST(0, LEAST(100,
                   206.835
                   - 1.015 * (
                       words::float /
                       (CASE WHEN sentences_raw = 0 AND words > 5 THEN 1 ELSE sentences_raw END)
                     )
                   - 84.6  * (syllables::float / words)
-                )
+                ))
               END AS readability_score
 
             FROM metrics
diff --git a/plugins/discourse-rewind/spec/actions/writing_analysis_spec.rb b/plugins/discourse-rewind/spec/actions/writing_analysis_spec.rb
@@ -0,0 +1,207 @@
+# frozen_string_literal: true
+
+RSpec.describe DiscourseRewind::Action::WritingAnalysis do
+  fab!(:date) { Date.new(2021).all_year }
+  fab!(:user) { Fabricate(:user, refresh_auto_groups: true) }
+  fab!(:other_user, :user)
+
+  fab!(:post1) do
+    Fabricate(
+      :post,
+      user: user,
+      raw: "This is a simple post. It has two sentences.",
+      created_at: random_datetime,
+    )
+  end
+
+  fab!(:post2) do
+    Fabricate(
+      :post,
+      user: user,
+      raw:
+        "Here is another post with more content. It contains multiple sentences. This helps test the readability calculation.",
+      created_at: random_datetime,
+    )
+  end
+
+  fab!(:post3) do
+    Fabricate(
+      :post,
+      user: user,
+      raw:
+        "A longer post with various sentence structures. Some are short. Others are quite a bit longer and contain more complex vocabulary and punctuation! Does this affect the score?",
+      created_at: random_datetime,
+    )
+  end
+
+  fab!(:other_user_post) do
+    Fabricate(
+      :post,
+      user: other_user,
+      raw: "This post is from another user and should not be included.",
+      created_at: random_datetime,
+    )
+  end
+
+  describe ".call" do
+    it "calculates total words correctly" do
+      result = call_report
+
+      expect(result[:data][:total_words]).to be > 0
+      expect(result[:data][:total_words]).to eq(
+        [post1, post2, post3].sum { |p| p.reload.word_count },
+      )
+    end
+
+    it "calculates total posts correctly" do
+      result = call_report
+
+      expect(result[:data][:total_posts]).to eq(3)
+    end
+
+    it "calculates average post length correctly" do
+      result = call_report
+
+      total_words = [post1, post2, post3].sum { |p| p.reload.word_count }
+      expected_avg = (total_words.to_f / 3).round(2)
+
+      expect(result[:data][:average_post_length]).to eq(expected_avg)
+    end
+
+    it "calculates readability score" do
+      result = call_report
+
+      expect(result[:data][:readability_score]).to be_present
+      expect(result[:data][:readability_score]).to be_a(Numeric)
+    end
+
+    it "returns correct identifier" do
+      result = call_report
+
+      expect(result[:identifier]).to eq("writing-analysis")
+    end
+
+    it "bounds readability score between 0 and 100" do
+      result = call_report
+
+      score = result[:data][:readability_score]
+      expect(score).to be >= 0
+      expect(score).to be <= 100
+    end
+
+    context "when user has posts with very long sentences" do
+      fab!(:long_sentence_post) do
+        Fabricate(
+          :post,
+          user: user,
+          raw:
+            "This is an extremely long sentence that goes on and on without any punctuation to break it up which would normally result in a very low readability score because readers generally find it difficult to follow sentences that contain too many clauses and ideas without pausing for breath or mental processing time",
+          created_at: random_datetime,
+        )
+      end
+
+      it "handles low readability scores" do
+        result = call_report
+
+        expect(result[:data][:readability_score]).to be >= 0
+      end
+    end
+
+    context "when user has posts without punctuation" do
+      fab!(:no_punctuation_post) do
+        Fabricate(
+          :post,
+          user: user,
+          raw: "Just some words without any sentence ending punctuation",
+          created_at: random_datetime,
+        )
+      end
+
+      it "treats posts as having at least one sentence" do
+        result = call_report
+
+        expect(result[:data][:readability_score]).to be_present
+      end
+    end
+
+    context "when a post is deleted" do
+      before { post1.trash!(Discourse.system_user) }
+
+      it "does not include deleted posts in total posts" do
+        result = call_report
+
+        expect(result[:data][:total_posts]).to eq(2)
+      end
+
+      it "does not include deleted posts in total words" do
+        result = call_report
+
+        total_words = [post2, post3].sum { |p| p.reload.word_count }
+
+        expect(result[:data][:total_words]).to eq(total_words)
+      end
+    end
+
+    context "when posts are from another user" do
+      it "does not include other users' posts in total posts" do
+        result = call_report
+
+        expect(result[:data][:total_posts]).to eq(3)
+      end
+
+      it "does not include other users' posts in total words" do
+        result = call_report
+
+        expected_words = [post1, post2, post3].sum { |p| p.reload.word_count }
+
+        expect(result[:data][:total_words]).to eq(expected_words)
+      end
+    end
+
+    context "when user has no posts" do
+      fab!(:user_with_no_posts, :user)
+      fab!(:date) { Date.new(2021).all_year }
+
+      it "returns zero values gracefully" do
+        result = described_class.call(user: user_with_no_posts, date: date, guardian: user.guardian)
+
+        expect(result[:data][:total_words]).to be_nil
+        expect(result[:data][:total_posts]).to eq(0)
+        expect(result[:data][:average_post_length]).to eq(0)
+      end
+    end
+
+    context "with posts containing HTML and markdown" do
+      fab!(:formatted_post) do
+        Fabricate(
+          :post,
+          user: user,
+          raw:
+            "**Bold text** and *italic text*. [A link](https://example.com) and some code `var x = 1;`",
+          created_at: random_datetime,
+        )
+      end
+
+      it "strips HTML from readability calculation" do
+        result = call_report
+
+        expect(result[:data][:readability_score]).to be_present
+        expect(result[:data][:readability_score]).to be > 0
+      end
+    end
+  end
+
+  context "when in rails development mode" do
+    before { Rails.env.stubs(:development?).returns(true) }
+
+    it "returns fake data" do
+      result = call_report
+
+      expect(result[:identifier]).to eq("writing-analysis")
+      expect(result[:data][:total_words]).to eq(45_230)
+      expect(result[:data][:total_posts]).to eq(197)
+      expect(result[:data][:average_post_length]).to eq(230)
+      expect(result[:data][:readability_score]).to eq(65.4)
+    end
+  end
+end