Coverage for src/crawler/by_source/rcm_crawler.py: 82%

80 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

  1 import regex
  2 from bs4 import BeautifulSoup, Tag
  3 from lingua import Language, LanguageDetectorBuilder
  4 from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink
  5
  6 from crawler.base_crawler import BaseCollectionCrawler
  7 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
  8
  9
 10 class RcmCrawler(BaseCollectionCrawler):
 11     source_name = "Sociedad Colombiana de Matemáticas"
 12     source_domain = "RCM"
 13     source_website = "https://scm.org.co/web/publicaciones/"
 14
 15     base_url = "https://scm.org.co/archivos/revista/index.php"
 16
 17     issue_regex = regex.compile(r"\((?P<year>\d+)\) Vol\. (?P<volume>\d+) No\. (?P<number>.*)")
 18     article_regex = regex.compile(
 19         r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas (?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
 20     )
 21
 22     language_detector = LanguageDetectorBuilder.from_languages(
 23         Language.ENGLISH, Language.SERBIAN
 24     ).build()
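
The two compiled patterns above carry most of the parsing logic: issue_regex picks year/volume/number out of the issue labels on the index page, and article_regex pulls authors, title, page range and PDF size out of a single block of article text. A minimal sketch of what they are expected to match, using invented sample strings and assuming the usual src-layout import path crawler.by_source.rcm_crawler:

    from crawler.by_source.rcm_crawler import RcmCrawler

    # Invented issue label in the shape the index page is expected to use.
    issue = RcmCrawler.issue_regex.search("(2005) Vol. 39 No. 2")
    assert issue is not None
    assert issue.groupdict() == {"year": "2005", "volume": "39", "number": "2"}

    # Invented article block: authors, title, journal banner, page range, PDF size.
    article = RcmCrawler.article_regex.search(
        "Jane Doe & John Roe, A sample title REVISTA COLOMBIANA DE MATEMÁTICAS "
        "Volumen 39 (2005) Páginas 101 -- 120 Formato [PDF] (350 K)"
    )
    assert article is not None
    data = article.groupdict()
    assert data["authors"] == "Jane Doe & John Roe"
    assert data["title"].strip() == "A sample title"
    assert data["firstpage"] == "101" and data["lastpage"] == "120"
    assert data["size"] == "350"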

 25
 26     def parse_collection_content(self, content):
 27         soup = BeautifulSoup(content, "html.parser")
 28         xissues: list[IssueData] = []
 29
 30         issue_tags = soup.select("table td")
 31         for tag in issue_tags:
 32             issue_str = cleanup_str(tag.text)
 33             issue_match = self.issue_regex.search(issue_str)
 34             if not issue_match:
 35                 continue
 36             issue_data = issue_match.groupdict()
 37             href_tag = tag.select_one("a")
 38             if not href_tag:  # 38 ↛ 39: line 38 didn't jump to line 39 because the condition on line 38 was never true
 39                 raise ValueError("Cannot find issue url")
 40             href = href_tag.get("href")
 41             if not isinstance(href, str):  # 41 ↛ 42: line 41 didn't jump to line 42 because the condition on line 41 was never true
 42                 raise ValueError("Cannot parse issue url")
 43             if href.endswith("&Vol=&Num="):  # 43 ↛ 44: line 43 didn't jump to line 44 because the condition on line 43 was never true
 44                 continue
 45             href = self.base_url + href
 46
 47             xissues.append(
 48                 self.create_xissue(
 49                     href, issue_data["year"], issue_data["volume"], issue_data["number"]
 50                 )
 51             )
 52
 53         return xissues
 54
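The partial branches reported at lines 38, 41 and 43 above are never taken with the current test data. A pytest-style sketch of how the line 38 → 39 branch could be exercised, again assuming the crawler.by_source.rcm_crawler import path and bypassing the base-class constructor (whose signature is not part of this report):

    import pytest

    from crawler.by_source.rcm_crawler import RcmCrawler

    def test_issue_without_link_raises():
        # A table cell that matches issue_regex but has no <a> tag, so
        # parse_collection_content should raise on line 39.
        html = "<table><tr><td>(2005) Vol. 39 No. 2</td></tr></table>"
        crawler = object.__new__(RcmCrawler)  # skip __init__; only class attributes are used on this path
        with pytest.raises(ValueError, match="Cannot find issue url"):
            crawler.parse_collection_content(html)
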

 55     def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
 56         soup = BeautifulSoup(content, "html.parser")
 57         articles_tags = soup.select("div.wpb_content_element > p")
 58         articles_tags.pop(0)
 59         index = 0
 60         if not xissue.url:  # 60 ↛ 61: line 60 didn't jump to line 61 because the condition on line 60 was never true
 61             raise ValueError("Issue url is not set")
 62         for tag in articles_tags:
 63             if tag.text == "":  # 63 ↛ 64: line 63 didn't jump to line 64 because the condition on line 63 was never true
 64                 continue
 65             text = cleanup_str(tag.text)
 66             article_match = self.article_regex.search(text)
 67             pid = f"{xissue.pid}_a{index}"
 68             index += 1
 69             if not article_match:  # 69 ↛ 70: line 69 didn't jump to line 70 because the condition on line 69 was never true
 70                 print(
 71                     f"[{self.source_domain}] {xissue.pid} : Cannot parse title. Skipping Article {pid}"
 72                 )
 73                 continue
 74             article_data = article_match.groupdict()
 75
 76             if article_data["size"] == "0":
 77                 print(
 78                     f"[{self.source_domain}] {xissue.pid} : Article has no pdf. Skipping Article {pid}"
 79                 )
 80                 continue
 81
 82             xarticle = create_articledata()
 83             xarticle.pid = pid
 84
 85             # Authors
 86             if article_data["authors"]:  # 86 ↛ 98: line 86 didn't jump to line 98 because the condition on line 86 was always true
 87                 authors = article_data["authors"].replace("&", ",")
 88                 authors = regex.sub(r"\(.*\)", "", authors)
 89                 authors = authors.split(",")
 90                 for a in authors:
 91                     a = cleanup_str(a)
 92                     if len(a) > 0:  # 92 ↛ 90: line 92 didn't jump to line 90 because the condition on line 92 was always true
 93                         xarticle.contributors.append(
 94                             create_contributor(role="author", string_name=a)
 95                         )
 96
 97             # Pages
 98             if article_data["firstpage"] and article_data["lastpage"]:  # 98 ↛ 103: line 98 didn't jump to line 103 because the condition on line 98 was always true
 99                 xarticle.fpage = article_data["firstpage"]
100                 xarticle.lpage = article_data["lastpage"]
101
102             # Link
103             ext_link = create_extlink(
104                 rel="source", location=xissue.url, metadata=self.source_domain
105             )
106             xarticle.ext_links.append(ext_link)
107
108             # Title
109             xarticle.title_tex = article_data["title"]
110             # PDF
111             pdf_tag = tag.find("a", text="[PDF]")
112             if not isinstance(pdf_tag, Tag):  # 112 ↛ 113: line 112 didn't jump to line 113 because the condition on line 112 was never true
113                 raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot find pdf link")
114             pdf_link = pdf_tag.get("href")
115             if not isinstance(pdf_link, str):  # 115 ↛ 116: line 115 didn't jump to line 116 because the condition on line 115 was never true
116                 raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot parse pdf link")
117             add_pdf_link_to_xarticle(xarticle, "https://scm.org.co" + pdf_link)
118
119             xissue.articles.append(xarticle)
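
For reference, the author handling at lines 87-95 (drop parenthesised affiliations, split on "&" and ",") behaves roughly as follows on an invented author string, with str.strip() standing in here for cleanup_str from crawler.utils:

    import regex

    authors = "Jane Doe (Universidad Nacional) & John Roe"  # invented sample
    authors = authors.replace("&", ",")
    authors = regex.sub(r"\(.*\)", "", authors)
    names = [a.strip() for a in authors.split(",") if a.strip()]
    # names == ["Jane Doe", "John Roe"]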