Coverage for src/crawler/by_source/rcm_crawler.py: 82%

81 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

  1  import regex
  2  from bs4 import BeautifulSoup, Tag
  3  from lingua import Language, LanguageDetectorBuilder
  4  from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink
  5
  6  from crawler.base_crawler import BaseCollectionCrawler
  7  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
  8
  9

 10  class RcmCrawler(BaseCollectionCrawler):
 11      source_name = "Sociedad Colombiana de Matemáticas"
 12      source_domain = "RCM"
 13      source_website = "https://scm.org.co/web/publicaciones/"
 14
 15      base_url = "https://scm.org.co/archivos/revista/index.php"
 16
 17      issue_regex = regex.compile(r"\((?P<year>\d+)\) Vol\. (?P<volume>\d+) No\. (?P<number>.*)")
 18      article_regex = regex.compile(
 19          r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas (?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
 20      )
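         # Illustrative only: issue_regex is written for table-cell text shaped like
         # "(2016) Vol. 50 No. 1", and article_regex for listing entries shaped roughly like
         # "A. Author & B. Author, Some title REVISTA COLOMBIANA DE MATEMÁTICAS ... Páginas 1 -- 20 ... Formato [PDF] (350 K)".
         # A runnable sketch exercising article_regex appears after the listing.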

 21
 22      def build_language_detector(self):
 23          self.language_detector = LanguageDetectorBuilder.from_languages(
 24              Language.ENGLISH, Language.SPANISH
 25          ).build()
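             # The detector is presumably consumed by the base crawler to tag the language
             # of titles and abstracts; lingua's main entry point is detect_language_of(),
             # e.g. (illustrative):
             #     self.language_detector.detect_language_of("Sobre la ecuación del calor")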

26 
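         # parse_collection_content walks the archive index page: every <td> of the issues
         # table whose text matches issue_regex ("(year) Vol. V No. N") becomes one issue;
         # links ending in "&Vol=&Num=" (presumably empty placeholder cells) are skipped,
         # and relative hrefs are resolved against base_url.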

 27      def parse_collection_content(self, content):
 28          soup = BeautifulSoup(content, "html.parser")
 29          xissues: list[IssueData] = []
 30
 31          issue_tags = soup.select("table td")
 32          for tag in issue_tags:
 33              issue_str = cleanup_str(tag.text)
 34              issue_match = self.issue_regex.search(issue_str)
 35              if not issue_match:
 36                  continue
 37              issue_data = issue_match.groupdict()
 38              href_tag = tag.select_one("a")
 39              if not href_tag:    (39 ↛ 40: line 39 didn't jump to line 40 because the condition on line 39 was never true)
 40                  raise ValueError("Cannot find issue url")
 41              href = href_tag.get("href")
 42              if not isinstance(href, str):    (42 ↛ 43: line 42 didn't jump to line 43 because the condition on line 42 was never true)
 43                  raise ValueError("Cannot parse issue url")
 44              if href.endswith("&Vol=&Num="):    (44 ↛ 45: line 44 didn't jump to line 45 because the condition on line 44 was never true)
 45                  continue
 46              href = self.base_url + href
 47
 48              xissues.append(
 49                  self.create_xissue(
 50                      href, issue_data["year"], issue_data["volume"], issue_data["number"]
 51                  )
 52              )
 53
 54          return xissues

55 
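         # parse_issue_content walks one issue's table of contents: each <p> directly under
         # div.wpb_content_element is one article entry (the first paragraph, presumably a
         # header, is dropped); entries whose PDF size is "0" K are skipped as having no PDF.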

 56      def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
 57          soup = BeautifulSoup(content, "html.parser")
 58          articles_tags = soup.select("div.wpb_content_element > p")
 59          articles_tags.pop(0)
 60          index = 0
 61          if not xissue.url:    (61 ↛ 62: line 61 didn't jump to line 62 because the condition on line 61 was never true)
 62              raise ValueError("Issue url is not set")
 63          for tag in articles_tags:
 64              if tag.text == "":    (64 ↛ 65: line 64 didn't jump to line 65 because the condition on line 64 was never true)
 65                  continue
 66              text = cleanup_str(tag.text)
 67              article_match = self.article_regex.search(text)
 68              pid = f"{xissue.pid}_a{index}"
 69              index += 1
 70              if not article_match:    (70 ↛ 71: line 70 didn't jump to line 71 because the condition on line 70 was never true)
 71                  print(
 72                      f"[{self.source_domain}] {xissue.pid} : Cannot parse title. Skipping Article {pid}"
 73                  )
 74                  continue
 75              article_data = article_match.groupdict()
 76
 77              if article_data["size"] == "0":
 78                  print(
 79                      f"[{self.source_domain}] {xissue.pid} : Article has no pdf. Skipping Article {pid}"
 80                  )
 81                  continue
 82
 83              xarticle = create_articledata()
 84              xarticle.pid = pid
 85
 86              # Authors
 87              if article_data["authors"]:    (87 ↛ 99: line 87 didn't jump to line 99 because the condition on line 87 was always true)
 88                  authors = article_data["authors"].replace("&", ",")
 89                  authors = regex.sub(r"\(.*\)", "", authors)
 90                  authors = authors.split(",")
 91                  for a in authors:
 92                      a = cleanup_str(a)
 93                      if len(a) > 0:    (93 ↛ 91: line 93 didn't jump to line 91 because the condition on line 93 was always true)
 94                          xarticle.contributors.append(
 95                              create_contributor(role="author", string_name=a)
 96                          )
 97
 98              # Pages
 99              if article_data["firstpage"] and article_data["lastpage"]:    (99 ↛ 104: line 99 didn't jump to line 104 because the condition on line 99 was always true)
100                  xarticle.fpage = article_data["firstpage"]
101                  xarticle.lpage = article_data["lastpage"]
102
103              # Link
104              ext_link = create_extlink(
105                  rel="source", location=xissue.url, metadata=self.source_domain
106              )
107              xarticle.ext_links.append(ext_link)
108
109              # Title
110              xarticle.title_tex = article_data["title"]
111              # PDF
112              pdf_tag = tag.find("a", string="[PDF]")
113              if not isinstance(pdf_tag, Tag):    (113 ↛ 114: line 113 didn't jump to line 114 because the condition on line 113 was never true)
114                  raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot find pdf link")
115              pdf_link = pdf_tag.get("href")
116              if not isinstance(pdf_link, str):    (116 ↛ 117: line 116 didn't jump to line 117 because the condition on line 116 was never true)
117                  raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot parse pdf link")
118              add_pdf_link_to_xarticle(xarticle, "https://scm.org.co" + pdf_link)
119
120              xissue.articles.append(xarticle)
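
As a quick sanity check on the parsing logic above, the sketch below runs article_regex against a made-up listing entry; the authors, title, and numbers are illustrative, not taken from the site.

import regex

article_regex = regex.compile(
    r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas "
    r"(?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
)

# Made-up entry shaped like the text parse_issue_content extracts from each <p> tag.
sample = (
    "Jaime Pérez & María Gómez, Un teorema de ejemplo "
    "REVISTA COLOMBIANA DE MATEMÁTICAS Volumen 50 (2016) "
    "Páginas 1 -- 20 Formato [PDF] (350 K)"
)

match = article_regex.search(sample)
assert match is not None
print(match.groupdict())
# {'authors': 'Jaime Pérez & María Gómez', 'title': ' Un teorema de ejemplo',
#  'firstpage': '1', 'lastpage': '20', 'size': '350'}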