Skip to content

Commit 278e36c

Browse files
FIX: Better handling of syllables in rewind writing analysis (#36616)
Several users were ending up with negative values in their readability scores for Rewind. The issue was with the syllable-counting regex in the old query: `regexp_count(lower(plain), '[aeiouy]+') AS syllables`. This regex counted consecutive vowel clusters, not syllables. This caused massive overcounting: - Technical posts with URLs/emails have lots of vowel sequences - Markdown code blocks with variable names - Words like "queue" (1 syllable) count as 2+ vowel clusters - The pattern [aeiouy]+ matches any consecutive vowels as a single match, but doesn't account for proper syllable boundaries. For example, for a user with 20,181 words and an average score of -49.94, the formula gave `-49.94 = 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)`. This means the syllables/words ratio was extremely high (likely 2.5-3.0+ instead of the realistic ~1.4). To fix this, we instead use an approximation of syllable counting for English, using an average of 1.45 syllables per word. This is a common heuristic used in readability formulas when precise syllable counts are not available.
1 parent e67d9b5 commit 278e36c

File tree

2 files changed

+218
-7
lines changed

2 files changed

+218
-7
lines changed

plugins/discourse-rewind/app/services/discourse_rewind/action/writing_analysis.rb

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,15 @@ def call
3636
post_count.first > 0 ? (total_words.first.to_f / post_count.first).round(2) : 0
3737

3838
# Calculated using the Flesch Reading Ease formula,
39-
# with an approximation for syllables since this can
40-
# be tricky to get right in SQL.
39+
# with a statistical approximation for syllables (1.45 per word,
40+
# which is the average for English text). This is more reliable
41+
# than regex-based syllable counting which can be thrown off by
42+
# URLs, code blocks, and technical terminology.
4143
#
42-
# Tries to handle short sentences or ones without delmiters
44+
# Tries to handle short sentences or ones without delimiters
4345
# and ending with emojis by treating them as a single sentence.
46+
#
47+
# Scores are bounded between 0-100 to prevent extreme negative values.
4448
readability_score =
4549
DB.query_single(<<~SQL, user_id: user.id, start: date.first, end: date.last)
4650
WITH cleaned AS (
@@ -62,7 +66,7 @@ def call
6266
plain,
6367
word_count AS words,
6468
regexp_count(plain, '[.!?;:](\s|$)') AS sentences_raw,
65-
regexp_count(lower(plain), '[aeiouy]+') AS syllables
69+
(word_count * 1.45) AS syllables
6670
FROM cleaned
6771
),
6872
scores AS (
@@ -79,18 +83,18 @@ def call
7983
ELSE sentences_raw
8084
END AS sentences_fixed,
8185
82-
-- Flesch Reading Ease formula
86+
-- Flesch Reading Ease formula with bounds (0-100)
8387
CASE
8488
WHEN words = 0 THEN NULL
8589
WHEN (CASE WHEN sentences_raw = 0 AND words > 5 THEN 1 ELSE sentences_raw END) = 0 THEN NULL
86-
ELSE (
90+
ELSE GREATEST(0, LEAST(100,
8791
206.835
8892
- 1.015 * (
8993
words::float /
9094
(CASE WHEN sentences_raw = 0 AND words > 5 THEN 1 ELSE sentences_raw END)
9195
)
9296
- 84.6 * (syllables::float / words)
93-
)
97+
))
9498
END AS readability_score
9599
96100
FROM metrics
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# frozen_string_literal: true
2+
3+
RSpec.describe DiscourseRewind::Action::WritingAnalysis do
4+
fab!(:date) { Date.new(2021).all_year }
5+
fab!(:user) { Fabricate(:user, refresh_auto_groups: true) }
6+
fab!(:other_user, :user)
7+
8+
fab!(:post1) do
9+
Fabricate(
10+
:post,
11+
user: user,
12+
raw: "This is a simple post. It has two sentences.",
13+
created_at: random_datetime,
14+
)
15+
end
16+
17+
fab!(:post2) do
18+
Fabricate(
19+
:post,
20+
user: user,
21+
raw:
22+
"Here is another post with more content. It contains multiple sentences. This helps test the readability calculation.",
23+
created_at: random_datetime,
24+
)
25+
end
26+
27+
fab!(:post3) do
28+
Fabricate(
29+
:post,
30+
user: user,
31+
raw:
32+
"A longer post with various sentence structures. Some are short. Others are quite a bit longer and contain more complex vocabulary and punctuation! Does this affect the score?",
33+
created_at: random_datetime,
34+
)
35+
end
36+
37+
fab!(:other_user_post) do
38+
Fabricate(
39+
:post,
40+
user: other_user,
41+
raw: "This post is from another user and should not be included.",
42+
created_at: random_datetime,
43+
)
44+
end
45+
46+
describe ".call" do
47+
it "calculates total words correctly" do
48+
result = call_report
49+
50+
expect(result[:data][:total_words]).to be > 0
51+
expect(result[:data][:total_words]).to eq(
52+
[post1, post2, post3].sum { |p| p.reload.word_count },
53+
)
54+
end
55+
56+
it "calculates total posts correctly" do
57+
result = call_report
58+
59+
expect(result[:data][:total_posts]).to eq(3)
60+
end
61+
62+
it "calculates average post length correctly" do
63+
result = call_report
64+
65+
total_words = [post1, post2, post3].sum { |p| p.reload.word_count }
66+
expected_avg = (total_words.to_f / 3).round(2)
67+
68+
expect(result[:data][:average_post_length]).to eq(expected_avg)
69+
end
70+
71+
it "calculates readability score" do
72+
result = call_report
73+
74+
expect(result[:data][:readability_score]).to be_present
75+
expect(result[:data][:readability_score]).to be_a(Numeric)
76+
end
77+
78+
it "returns correct identifier" do
79+
result = call_report
80+
81+
expect(result[:identifier]).to eq("writing-analysis")
82+
end
83+
84+
it "bounds readability score between 0 and 100" do
85+
result = call_report
86+
87+
score = result[:data][:readability_score]
88+
expect(score).to be >= 0
89+
expect(score).to be <= 100
90+
end
91+
92+
context "when user has posts with very long sentences" do
93+
fab!(:long_sentence_post) do
94+
Fabricate(
95+
:post,
96+
user: user,
97+
raw:
98+
"This is an extremely long sentence that goes on and on without any punctuation to break it up which would normally result in a very low readability score because readers generally find it difficult to follow sentences that contain too many clauses and ideas without pausing for breath or mental processing time",
99+
created_at: random_datetime,
100+
)
101+
end
102+
103+
it "handles low readability scores" do
104+
result = call_report
105+
106+
expect(result[:data][:readability_score]).to be >= 0
107+
end
108+
end
109+
110+
context "when user has posts without punctuation" do
111+
fab!(:no_punctuation_post) do
112+
Fabricate(
113+
:post,
114+
user: user,
115+
raw: "Just some words without any sentence ending punctuation",
116+
created_at: random_datetime,
117+
)
118+
end
119+
120+
it "treats posts as having at least one sentence" do
121+
result = call_report
122+
123+
expect(result[:data][:readability_score]).to be_present
124+
end
125+
end
126+
127+
context "when a post is deleted" do
128+
before { post1.trash!(Discourse.system_user) }
129+
130+
it "does not include deleted posts in total posts" do
131+
result = call_report
132+
133+
expect(result[:data][:total_posts]).to eq(2)
134+
end
135+
136+
it "does not include deleted posts in total words" do
137+
result = call_report
138+
139+
total_words = [post2, post3].sum { |p| p.reload.word_count }
140+
141+
expect(result[:data][:total_words]).to eq(total_words)
142+
end
143+
end
144+
145+
context "when posts are from another user" do
146+
it "does not include other users' posts in total posts" do
147+
result = call_report
148+
149+
expect(result[:data][:total_posts]).to eq(3)
150+
end
151+
152+
it "does not include other users' posts in total words" do
153+
result = call_report
154+
155+
expected_words = [post1, post2, post3].sum { |p| p.reload.word_count }
156+
157+
expect(result[:data][:total_words]).to eq(expected_words)
158+
end
159+
end
160+
161+
context "when user has no posts" do
162+
fab!(:user_with_no_posts, :user)
163+
fab!(:date) { Date.new(2021).all_year }
164+
165+
it "returns zero values gracefully" do
166+
result = described_class.call(user: user_with_no_posts, date: date, guardian: user.guardian)
167+
168+
expect(result[:data][:total_words]).to be_nil
169+
expect(result[:data][:total_posts]).to eq(0)
170+
expect(result[:data][:average_post_length]).to eq(0)
171+
end
172+
end
173+
174+
context "with posts containing HTML and markdown" do
175+
fab!(:formatted_post) do
176+
Fabricate(
177+
:post,
178+
user: user,
179+
raw:
180+
"**Bold text** and *italic text*. [A link](https://example.com) and some code `var x = 1;`",
181+
created_at: random_datetime,
182+
)
183+
end
184+
185+
it "strips HTML from readability calculation" do
186+
result = call_report
187+
188+
expect(result[:data][:readability_score]).to be_present
189+
expect(result[:data][:readability_score]).to be > 0
190+
end
191+
end
192+
end
193+
194+
context "when in rails development mode" do
195+
before { Rails.env.stubs(:development?).returns(true) }
196+
197+
it "returns fake data" do
198+
result = call_report
199+
200+
expect(result[:identifier]).to eq("writing-analysis")
201+
expect(result[:data][:total_words]).to eq(45_230)
202+
expect(result[:data][:total_posts]).to eq(197)
203+
expect(result[:data][:average_post_length]).to eq(230)
204+
expect(result[:data][:readability_score]).to eq(65.4)
205+
end
206+
end
207+
end

0 commit comments

Comments
 (0)