Coverage for src/crawler/by_source/rcm_crawler.py: 82% (81 statements)
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
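
# Crawler for the journal archive of the Sociedad Colombiana de Matemáticas
# (Revista Colombiana de Matemáticas), hosted at https://scm.org.co.
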
import regex
from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class RcmCrawler(BaseCollectionCrawler):
    source_name = "Sociedad Colombiana de Matemáticas"
    source_domain = "RCM"
    source_website = "https://scm.org.co/web/publicaciones/"

    base_url = "https://scm.org.co/archivos/revista/index.php"

    issue_regex = regex.compile(r"\((?P<year>\d+)\) Vol\. (?P<volume>\d+) No\. (?P<number>.*)")
    article_regex = regex.compile(
        r"(?P<authors>.+(?:&.+)?)?(?:,)(?P<title>.+) REVISTA COLOMBIANA DE MATEMÁTICAS.+Páginas (?:(?P<firstpage>\d+) ?--? ?(?P<lastpage>\d+))?.+Formato \[PDF\] \((?P<size>\d+) K\)"
    )
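    # Example strings these regexes are written for (reconstructed from the
    # patterns themselves, not copied from the live site):
    #   issue_regex:   "(2010) Vol. 44 No. 2"
    #   article_regex: "A. Pérez & B. Gómez, Some title REVISTA COLOMBIANA DE
    #                   MATEMÁTICAS ... Páginas 1 -- 20 ... Formato [PDF] (350 K)"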

    def build_language_detector(self):
        # Restrict detection to the two languages the journal publishes in.
        self.language_detector = LanguageDetectorBuilder.from_languages(
            Language.ENGLISH, Language.SPANISH
        ).build()

    def parse_collection_content(self, content):
        """Parse the collection index page and return the list of issues found."""
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        issue_tags = soup.select("table td")
        for tag in issue_tags:
            issue_str = cleanup_str(tag.text)
            issue_match = self.issue_regex.search(issue_str)
            if not issue_match:
                continue
            issue_data = issue_match.groupdict()
            href_tag = tag.select_one("a")
            if not href_tag:
                raise ValueError("Cannot find issue url")
            href = href_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Cannot parse issue url")
            # Skip entries whose volume/number query parameters are empty.
            if href.endswith("&Vol=&Num="):
                continue
            href = self.base_url + href

            xissues.append(
                self.create_xissue(
                    href, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        """Parse an issue page and append the articles found to the issue."""
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("div.wpb_content_element > p")
        # The first paragraph is the issue header, not an article entry.
        articles_tags.pop(0)
        index = 0
        if not xissue.url:
            raise ValueError("Issue url is not set")
        for tag in articles_tags:
            if tag.text == "":
                continue
            text = cleanup_str(tag.text)
            article_match = self.article_regex.search(text)
            pid = f"{xissue.pid}_a{index}"
            index += 1
            if not article_match:
                print(
                    f"[{self.source_domain}] {xissue.pid} : Cannot parse title. Skipping article {pid}"
                )
                continue
            article_data = article_match.groupdict()

            # A reported size of "0 K" means there is no PDF to download.
            if article_data["size"] == "0":
                print(
                    f"[{self.source_domain}] {xissue.pid} : Article has no pdf. Skipping article {pid}"
                )
                continue

            xarticle = create_articledata()
            xarticle.pid = pid

            # Authors: normalize "&" separators, drop parenthesized affiliations.
            if article_data["authors"]:
                authors = article_data["authors"].replace("&", ",")
                authors = regex.sub(r"\(.*\)", "", authors)
                authors = authors.split(",")
                for a in authors:
                    a = cleanup_str(a)
                    if len(a) > 0:
                        xarticle.contributors.append(
                            create_contributor(role="author", string_name=a)
                        )

            # Pages
            if article_data["firstpage"] and article_data["lastpage"]:
                xarticle.fpage = article_data["firstpage"]
                xarticle.lpage = article_data["lastpage"]

            # Link back to the issue page as the article's source.
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)

            # Title
            xarticle.title_tex = article_data["title"]

            # PDF
            pdf_tag = tag.find("a", string="[PDF]")
            if not isinstance(pdf_tag, Tag):
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot find pdf link")
            pdf_link = pdf_tag.get("href")
            if not isinstance(pdf_link, str):
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot parse pdf link")
            add_pdf_link_to_xarticle(xarticle, "https://scm.org.co" + pdf_link)

            xissue.articles.append(xarticle)
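

if __name__ == "__main__":
    # Minimal sanity check for article_regex. The sample line below is
    # hypothetical, reconstructed from the pattern itself rather than copied
    # from the live site; real pages may differ slightly.
    sample = (
        "A. Pérez & B. Gómez, On a class of operators "
        "REVISTA COLOMBIANA DE MATEMÁTICAS Volumen 44 (2010) Páginas 1 -- 20 "
        "Formato [PDF] (350 K)"
    )
    match = RcmCrawler.article_regex.search(sample)
    if match:
        # Expected groups: authors, title, firstpage, lastpage, size.
        print(match.groupdict())
    else:
        print("article_regex did not match the sample line")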