Skip to content

Commit 7566ddf

Browse files
enhance spammy category filter (commons-app#6167)
Signed-off-by: parneet-guraya <gurayaparneet@gmail.com>
1 parent e653857 commit 7566ddf

File tree

3 files changed

+61
-24
lines changed

3 files changed

+61
-24
lines changed

app/.attach_pid781771

Whitespace-only changes.

app/src/main/java/fr/free/nrw/commons/category/CategoriesModel.kt

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -36,37 +36,35 @@ class CategoriesModel
3636
* @return
3737
*/
3838
fun isSpammyCategory(item: String): Boolean {
39-
// Check for current and previous year to exclude these categories from removal
40-
val now = Calendar.getInstance()
41-
val curYear = now[Calendar.YEAR]
42-
val curYearInString = curYear.toString()
43-
val prevYear = curYear - 1
44-
val prevYearInString = prevYear.toString()
45-
Timber.d("Previous year: %s", prevYearInString)
46-
47-
val mentionsDecade = item.matches(".*0s.*".toRegex())
48-
val recentDecade = item.matches(".*20[0-2]0s.*".toRegex())
49-
val spammyCategory =
50-
item.matches("(.*)needing(.*)".toRegex()) ||
51-
item.matches("(.*)taken on(.*)".toRegex())
5239

5340
// Always skip irrelevant categories such as Media_needing_categories_as_of_16_June_2017 (Issue #750)
41+
val spammyCategory = item.matches("(.*)needing(.*)".toRegex())
42+
|| item.matches("(.*)taken on(.*)".toRegex())
43+
44+
// Flags date-like patterns in any of these forms:
45+
// dd/mm/yyyy or dd/mm/yy
46+
// yyyy/mm/dd or yy/mm/dd
47+
// yyyy/mm or yy/mm
48+
// mm/yyyy or mm/yy
49+
// For the two-digit `yy` form, the century 20XX is assumed.
50+
// The separator may be any of '.', '/' or '-'.
51+
val isIrrelevantCategory =
52+
item.contains("""\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|\d{2,4}[-/.]\d{1,2}[-/.]\d{1,2}|\d{2,4}[-/.]\d{1,2}|\d{1,2}[-/.]\d{2,4}""".toRegex())
53+
54+
5455
if (spammyCategory) {
5556
return true
5657
}
5758

58-
if (mentionsDecade) {
59-
// Check if the year in the form of XX(X)0s is recent/relevant, i.e. in the 2000s or 2010s/2020s as stated in Issue #1029
60-
// Example: "2020s" is OK, but "1920s" is not (and should be skipped)
61-
return !recentDecade
62-
} else {
63-
// If it is not an year in decade form (e.g. 19xxs/20xxs), then check if item contains a 4-digit year
64-
// anywhere within the string (.* is wildcard) (Issue #47)
65-
// And that item does not equal the current year or previous year
66-
return item.matches(".*(19|20)\\d{2}.*".toRegex()) &&
67-
!item.contains(curYearInString) &&
68-
!item.contains(prevYearInString)
59+
if(isIrrelevantCategory){
60+
return true
6961
}
62+
63+
val hasYear = item.matches("(.*\\d{4}.*)".toRegex())
64+
val validYearsRange = item.matches(".*(20[0-9]{2}).*".toRegex())
65+
66+
// Finally, if the item contains a 4-digit year (XXXX), it is only acceptable when that year is in the 20XX range.
67+
return hasYear && !validYearsRange
7068
}
7169

7270
/**

app/src/test/kotlin/fr/free/nrw/commons/category/CategoriesModelTest.kt

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import fr.free.nrw.commons.upload.GpsCategoryModel
1111
import io.reactivex.Single
1212
import io.reactivex.subjects.BehaviorSubject
1313
import media
14+
import org.junit.Assert
1415
import org.junit.Before
1516
import org.junit.Test
1617
import org.mockito.ArgumentMatchers
@@ -331,4 +332,42 @@ class CategoriesModelTest {
331332
media(),
332333
)
333334
}
335+
336+
@Test
337+
fun `test valid input with XXXX in it between the expected range 20XX`() {
338+
val input = categoriesModel.isSpammyCategory("Amavenita (ship, 2014)")
339+
Assert.assertFalse(input)
340+
}
341+
342+
@Test
343+
fun `test valid input with XXXXs in it between the expected range 20XXs`() {
344+
val input = categoriesModel.isSpammyCategory("Amavenita (ship, 2014s)")
345+
Assert.assertFalse(input)
346+
}
347+
348+
@Test
349+
fun `test invalid category when have needing in the input`() {
350+
val input = categoriesModel.isSpammyCategory("Media needing categories as of 30 March 2017")
351+
Assert.assertTrue(input)
352+
}
353+
354+
@Test
355+
fun `test invalid category when have taken on in the input`() {
356+
val input = categoriesModel.isSpammyCategory("Photographs taken on 2015-12-08")
357+
Assert.assertTrue(input)
358+
}
359+
360+
@Test
361+
fun `test invalid category when have yy mm or yy mm dd in the input`() {
362+
// filtering based on [., /, -] separators between the dates.
363+
val input = categoriesModel.isSpammyCategory("Image class 09.14")
364+
Assert.assertTrue(input)
365+
}
366+
367+
@Test
368+
fun `test invalid category when have years not in 20XX range`() {
369+
val input = categoriesModel.isSpammyCategory("Japan in the 1400s")
370+
Assert.assertTrue(input)
371+
}
372+
334373
}

0 commit comments

Comments
 (0)