Coverage for src/crawler/by_source/rcm_crawler.py: 82% (80 statements)
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import regex
from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class RcmCrawler(BaseCollectionCrawler):
    source_name = "Sociedad Colombiana de Matemáticas"
    source_domain = "RCM"
    source_website = "https://scm.org.co/web/publicaciones/"

    base_url = "https://scm.org.co/archivos/revista/index.php"
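
    # Matches issue headings such as "(2001) Vol. 35 No. 2" (illustrative sample).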
    issue_regex = regex.compile(r"\((?P<year>\d+)\) Vol\. (?P<volume>\d+) No\. (?P<number>.*)")
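    # Article listings are flattened one-liners; an illustrative (not real) sample:
    #   "J. Pérez & M. Gómez, On a theorem REVISTA COLOMBIANA DE MATEMÁTICAS
    #    Vol. 35 Páginas 1 -- 10 Formato [PDF] (320 K)"
    # The named groups capture authors, title, page range and PDF size.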
    article_regex = regex.compile(
        r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas (?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
    )
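
    # Restrict lingua's detector to a fixed set of candidate languages.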
    language_detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SERBIAN
    ).build()

    def parse_collection_content(self, content):
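        """Build one IssueData per issue link found in the archive's table cells."""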
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        issue_tags = soup.select("table td")
        for tag in issue_tags:
            issue_str = cleanup_str(tag.text)
            issue_match = self.issue_regex.search(issue_str)
            if not issue_match:
                continue
            issue_data = issue_match.groupdict()
            href_tag = tag.select_one("a")
            if not href_tag:  # coverage: branch never taken
                raise ValueError("Cannot find issue url")
            href = href_tag.get("href")
            if not isinstance(href, str):  # coverage: branch never taken
                raise ValueError("Cannot parse issue url")
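            # Skip placeholder links whose Vol and Num query parameters are empty.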
            if href.endswith("&Vol=&Num="):  # coverage: branch never taken
                continue
            href = self.base_url + href

            xissues.append(
                self.create_xissue(
                    href, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
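        """Populate xissue.articles from the article listings on an issue page."""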
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("div.wpb_content_element > p")
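        # The first paragraph is not an article listing, so drop it.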
        articles_tags.pop(0)
        index = 0
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("Issue url is not set")
        for tag in articles_tags:
            if tag.text == "":  # coverage: branch never taken
                continue
            text = cleanup_str(tag.text)
            article_match = self.article_regex.search(text)
            pid = f"{xissue.pid}_a{index}"
            index += 1
            if not article_match:  # coverage: branch never taken
                print(
                    f"[{self.source_domain}] {xissue.pid} : Cannot parse title. Skipping Article {pid}"
                )
                continue
            article_data = article_match.groupdict()

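            # A size of "0" marks listings with no PDF uploaded.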
            if article_data["size"] == "0":
                print(
                    f"[{self.source_domain}] {xissue.pid} : Article has no pdf. Skipping Article {pid}"
                )
                continue

            xarticle = create_articledata()
            xarticle.pid = pid

            # Authors
            if article_data["authors"]:  # coverage: branch always taken
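                # Normalize "A & B" to comma-separated names and strip parenthesized notes.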
                authors = article_data["authors"].replace("&", ",")
                authors = regex.sub(r"\(.*\)", "", authors)
                authors = authors.split(",")
                for a in authors:
                    a = cleanup_str(a)
                    if len(a) > 0:  # coverage: branch always taken
                        xarticle.contributors.append(
                            create_contributor(role="author", string_name=a)
                        )

            # Pages
            if article_data["firstpage"] and article_data["lastpage"]:  # coverage: branch always taken
                xarticle.fpage = article_data["firstpage"]
                xarticle.lpage = article_data["lastpage"]

            # Link
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)

            # Title
            xarticle.title_tex = article_data["title"]
            # PDF
            pdf_tag = tag.find("a", string="[PDF]")
            if not isinstance(pdf_tag, Tag):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot find pdf link")
            pdf_link = pdf_tag.get("href")
            if not isinstance(pdf_link, str):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot parse pdf link")
            add_pdf_link_to_xarticle(xarticle, "https://scm.org.co" + pdf_link)

            xissue.articles.append(xarticle)
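

# A minimal sketch of how the two parsers chain together, assuming a
# hypothetical fetch_page(url) helper and a no-argument constructor
# (BaseCollectionCrawler's real fetching and setup are not shown in this file):
#
#     crawler = RcmCrawler()
#     for xissue in crawler.parse_collection_content(fetch_page(crawler.base_url)):
#         crawler.parse_issue_content(fetch_page(xissue.url), xissue)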