Coverage for src/crawler/by_source/rcm_crawler.py: 82%

80 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

import regex
from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class RcmCrawler(BaseCollectionCrawler):
    source_name = "Sociedad Colombiana de Matemáticas"
    source_domain = "RCM"
    source_website = "https://scm.org.co/web/publicaciones/"

    base_url = "https://scm.org.co/archivos/revista/index.php"

    issue_regex = regex.compile(r"\((?P<year>\d+)\) Vol\. (?P<volume>\d+) No\. (?P<number>.*)")
    article_regex = regex.compile(
        r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas (?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
    )
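
    # Illustrative matches (sample strings invented for documentation, not
    # scraped output):
    #   issue_regex.search("(2019) Vol. 53 No. 2").groupdict()
    #       -> {"year": "2019", "volume": "53", "number": "2"}
    #   article_regex captures authors, title, firstpage/lastpage and the PDF
    #   size from listing entries shaped like
    #   "J. Doe & M. Pérez, On a sample topic REVISTA COLOMBIANA DE
    #    MATEMÁTICAS Volumen 53 (2019) Páginas 1 -- 20 Formato [PDF] (345 K)"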


    # The journal publishes in English and Spanish
    language_detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SPANISH
    ).build()


    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        issue_tags = soup.select("table td")
        for tag in issue_tags:
            issue_str = cleanup_str(tag.text)
            issue_match = self.issue_regex.search(issue_str)
            if not issue_match:
                continue
            issue_data = issue_match.groupdict()
            href_tag = tag.select_one("a")
            if not href_tag:  # coverage: branch never taken
                raise ValueError("Cannot find issue url")
            href = href_tag.get("href")
            if not isinstance(href, str):  # coverage: branch never taken
                raise ValueError("Cannot parse issue url")
            if href.endswith("&Vol=&Num="):  # coverage: branch never taken
                continue
            href = self.base_url + href

            xissues.append(
                self.create_xissue(
                    href, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )

        return xissues
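
    # Shape of the collection index this parser expects (invented
    # illustration; each issue sits in a <td> whose text matches issue_regex):
    #   <td><a href="?Vista=Volumen&Vol=53&Num=2">(2019) Vol. 53 No. 2</a></td>
    # Entries whose href ends in "&Vol=&Num=" are placeholders and are skipped.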


    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("div.wpb_content_element > p")
        articles_tags.pop(0)  # the first <p> is the issue header, not an article
        index = 0
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("Issue url is not set")
        for tag in articles_tags:
            if tag.text == "":  # coverage: branch never taken
                continue
            text = cleanup_str(tag.text)
            article_match = self.article_regex.search(text)
            pid = f"{xissue.pid}_a{index}"
            index += 1
            if not article_match:  # coverage: branch never taken
                self.logger.debug("Cannot parse title. Skipping Article", extra={"pid": pid})
                continue
            article_data = article_match.groupdict()

            if article_data["size"] == "0":
                self.logger.debug("Article has no pdf. Skipping Article", extra={"pid": pid})
                continue

            xarticle = create_articledata()
            xarticle.pid = pid

            # Authors
            if article_data["authors"]:  # coverage: always true
                authors = article_data["authors"].replace("&", ",")
                authors = regex.sub(r"\(.*\)", "", authors)  # strip parenthesized notes
                authors = authors.split(",")
                for a in authors:
                    a = cleanup_str(a)
                    if len(a) > 0:  # coverage: always true
                        xarticle.contributors.append(
                            create_contributor(role="author", string_name=a)
                        )

            # Pages
            if article_data["firstpage"] and article_data["lastpage"]:  # coverage: always true
                xarticle.fpage = article_data["firstpage"]
                xarticle.lpage = article_data["lastpage"]

            # Link
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)

            # Title
            xarticle.title_tex = article_data["title"]

            # PDF ("string=" replaces the bs4 "text=" argument, deprecated since 4.4)
            pdf_tag = tag.find("a", string="[PDF]")
            if not isinstance(pdf_tag, Tag):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot find pdf link")
            pdf_link = pdf_tag.get("href")
            if not isinstance(pdf_link, str):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot parse pdf link")
            add_pdf_link_to_xarticle(xarticle, "https://scm.org.co" + pdf_link)

            xissue.articles.append(xarticle)
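
A minimal driver sketch for the two parsers above (not part of the source
file): `fetch` stands in for whatever HTTP helper the base crawler actually
provides, and constructor arguments are omitted, so treat this as a sketch of
the call order rather than runnable client code.

    def crawl_rcm(crawler: RcmCrawler, fetch) -> list[IssueData]:
        # Collection page lists the issues; each issue page lists its articles.
        xissues = crawler.parse_collection_content(fetch(RcmCrawler.base_url))
        for xissue in xissues:
            # parse_issue_content fills xissue.articles in place
            crawler.parse_issue_content(fetch(xissue.url), xissue)
        return xissues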