Coverage for src/crawler/by_source/rcm_crawler.py: 82%

81 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

  1  import regex
  2  from bs4 import BeautifulSoup, Tag
  3  from lingua import Language, LanguageDetectorBuilder
  4  from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink
  5
  6  from crawler.base_crawler import BaseCollectionCrawler
  7  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
  8
  9

 10  class RcmCrawler(BaseCollectionCrawler):
 11      source_name = "Sociedad Colombiana de Matemáticas"
 12      source_domain = "RCM"
 13      source_website = "https://scm.org.co/web/publicaciones/"
 14
 15      base_url = "https://scm.org.co/archivos/revista/index.php"
 16
 17      issue_regex = regex.compile(r"\((?P<year>\d+)\) Vol\. (?P<volume>\d+) No\. (?P<number>.*)")
 18      article_regex = regex.compile(
 19          r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas (?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
 20      )
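         # Illustrative only: issue_regex is written for table-cell text shaped like
         # "(2016) Vol. 50 No. 1", and article_regex for listing entries shaped roughly like
         # "A. Author & B. Author, Some title REVISTA COLOMBIANA DE MATEMÁTICAS ... Páginas 1 -- 20 ... Formato [PDF] (350 K)".
         # A runnable sketch exercising article_regex appears after the listing.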

 21
 22      def build_language_detector(self):
 23          self.language_detector = LanguageDetectorBuilder.from_languages(
 24              Language.ENGLISH, Language.SPANISH
 25          ).build()
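             # The detector is presumably consumed by the base crawler to tag the language
             # of titles and abstracts; lingua's main entry point is detect_language_of(),
             # e.g. (illustrative):
             #     self.language_detector.detect_language_of("Sobre la ecuación del calor")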

26 
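         # parse_collection_content walks the archive index page: every <td> of the issues
         # table whose text matches issue_regex ("(year) Vol. V No. N") becomes one issue;
         # links ending in "&Vol=&Num=" (presumably empty placeholder cells) are skipped,
         # and relative hrefs are resolved against base_url.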

 27      def parse_collection_content(self, content):
 28          soup = BeautifulSoup(content, "html.parser")
 29          xissues: list[IssueData] = []
 30
 31          issue_tags = soup.select("table td")
 32          for tag in issue_tags:
 33              issue_str = cleanup_str(tag.text)
 34              issue_match = self.issue_regex.search(issue_str)
 35              if not issue_match:
 36                  continue
 37              issue_data = issue_match.groupdict()
 38              href_tag = tag.select_one("a")
 39              if not href_tag:    (39 ↛ 40: line 39 didn't jump to line 40 because the condition on line 39 was never true)
 40                  raise ValueError("Cannot find issue url")
 41              href = href_tag.get("href")
 42              if not isinstance(href, str):    (42 ↛ 43: line 42 didn't jump to line 43 because the condition on line 42 was never true)
 43                  raise ValueError("Cannot parse issue url")
 44              if href.endswith("&Vol=&Num="):    (44 ↛ 45: line 44 didn't jump to line 45 because the condition on line 44 was never true)
 45                  continue
 46              href = self.base_url + href
 47
 48              xissues.append(
 49                  self.create_xissue(
 50                      href, issue_data["year"], issue_data["volume"], issue_data["number"]
 51                  )
 52              )
 53
 54          return xissues

55 
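         # parse_issue_content walks one issue's table of contents: each <p> directly under
         # div.wpb_content_element is one article entry (the first paragraph, presumably a
         # header, is dropped); entries whose PDF size is "0" K are skipped as having no PDF.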

 56      def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
 57          soup = BeautifulSoup(content, "html.parser")
 58          articles_tags = soup.select("div.wpb_content_element > p")
 59          articles_tags.pop(0)
 60          index = 0
 61          if not xissue.url:    (61 ↛ 62: line 61 didn't jump to line 62 because the condition on line 61 was never true)
 62              raise ValueError("Issue url is not set")
 63          for tag in articles_tags:
 64              if tag.text == "":    (64 ↛ 65: line 64 didn't jump to line 65 because the condition on line 64 was never true)
 65                  continue
 66              text = cleanup_str(tag.text)
 67              article_match = self.article_regex.search(text)
 68              pid = f"{xissue.pid}_a{index}"
 69              index += 1
 70              if not article_match:    (70 ↛ 71: line 70 didn't jump to line 71 because the condition on line 70 was never true)
 71                  print(
 72                      f"[{self.source_domain}] {xissue.pid} : Cannot parse title. Skipping Article {pid}"
 73                  )
 74                  continue
 75              article_data = article_match.groupdict()
 76
 77              if article_data["size"] == "0":
 78                  print(
 79                      f"[{self.source_domain}] {xissue.pid} : Article has no pdf. Skipping Article {pid}"
 80                  )
 81                  continue
 82
 83              xarticle = create_articledata()
 84              xarticle.pid = pid
 85
 86              # Authors
 87              if article_data["authors"]:    (87 ↛ 99: line 87 didn't jump to line 99 because the condition on line 87 was always true)
 88                  authors = article_data["authors"].replace("&", ",")
 89                  authors = regex.sub(r"\(.*\)", "", authors)
 90                  authors = authors.split(",")
 91                  for a in authors:
 92                      a = cleanup_str(a)
 93                      if len(a) > 0:    (93 ↛ 91: line 93 didn't jump to line 91 because the condition on line 93 was always true)
 94                          xarticle.contributors.append(
 95                              create_contributor(role="author", string_name=a)
 96                          )
 97
 98              # Pages
 99              if article_data["firstpage"] and article_data["lastpage"]:    (99 ↛ 104: line 99 didn't jump to line 104 because the condition on line 99 was always true)
100                  xarticle.fpage = article_data["firstpage"]
101                  xarticle.lpage = article_data["lastpage"]
102
103              # Link
104              ext_link = create_extlink(
105                  rel="source", location=xissue.url, metadata=self.source_domain
106              )
107              xarticle.ext_links.append(ext_link)
108
109              # Title
110              xarticle.title_tex = article_data["title"]
111              # PDF
112              pdf_tag = tag.find("a", string="[PDF]")
113              if not isinstance(pdf_tag, Tag):    (113 ↛ 114: line 113 didn't jump to line 114 because the condition on line 113 was never true)
114                  raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot find pdf link")
115              pdf_link = pdf_tag.get("href")
116              if not isinstance(pdf_link, str):    (116 ↛ 117: line 116 didn't jump to line 117 because the condition on line 116 was never true)
117                  raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot parse pdf link")
118              add_pdf_link_to_xarticle(xarticle, "https://scm.org.co" + pdf_link)
119
120              xissue.articles.append(xarticle)
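
As a quick sanity check on the parsing logic above, the sketch below runs article_regex against a made-up listing entry; the authors, title, and numbers are illustrative, not taken from the site.

import regex

article_regex = regex.compile(
    r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas "
    r"(?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
)

# Made-up entry shaped like the text parse_issue_content extracts from each <p> tag.
sample = (
    "Jaime Pérez & María Gómez, Un teorema de ejemplo "
    "REVISTA COLOMBIANA DE MATEMÁTICAS Volumen 50 (2016) "
    "Páginas 1 -- 20 Formato [PDF] (350 K)"
)

match = article_regex.search(sample)
assert match is not None
print(match.groupdict())
# {'authors': 'Jaime Pérez & María Gómez', 'title': ' Un teorema de ejemplo',
#  'firstpage': '1', 'lastpage': '20', 'size': '350'}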