Coverage for src/crawler/by_source/bmms_crawler.py: 63% (139 statements)
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class BmmsCrawler(BaseCollectionCrawler):
    source_name = "BMMS"
    source_domain = "BMMS"
    source_website = "https://math.usm.my/bulletin/html/research.htm"

    issue_re = r"Volume (?P<volume>\d+)\s*•\s*Number (?P<number>\d+)\s*•\s*(?P<year>\d{4})"
    issue_re_bis = r"V(?P<volume>\d+)\s*•\s*N(?P<number>\d+[A-Z]?)\s*•\s*(?P<year>\d{4})"
    abstract_re = r"(?<=Abstract\.)(.*?)(?=[\d]{4}.*Mathematics Subject Classification)"
    abstract_no_msc_re = r"(?<=Abstract\.)(.*?)(?=Full.*text.*PDF)"
    msc_re = r"(?<=Mathematics Subject Classification:)(.*?)(?=Full)"
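    # Illustrative examples (not taken from the live site) of what the patterns
    # above are written to match:
    #   issue_re      -> "Volume 30 • Number 1 • 2007"
    #   issue_re_bis  -> "V30 • N2A • 2007"
    #   abstract_re captures the text between "Abstract." and the
    #   "<4-digit year> ... Mathematics Subject Classification" label,
    #   abstract_no_msc_re the text up to the "Full text PDF" link, and
    #   msc_re the classification codes after that label.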
    delimiter_disp_formula = {"\\begin{align*}", "$$"}
    delimiter_inline_formula = {"$", "\\(", "\\["}

    def get_authors(self, parent_tag, title_tag, title_text):
        """parent_tag is the soup element that contains the authors.
        Returns the authors as a single string.
        """
34 if "Linear Transformations of N-connections in OSC" in cleanup_str(title_text): 34 ↛ 35line 34 didn't jump to line 35 because the condition on line 34 was never true
35 return "Irena Čomić and Monica Purcaru"
36 if ( 36 ↛ 40line 36 didn't jump to line 40 because the condition on line 36 was never true
37 "Mapping fromTopological Space to Endomorphisms Algebra of Banach Space and its Applications"
38 in cleanup_str(title_text)
39 ):
40 return "Misir B. Ragimov"
        authors_tag = parent_tag.select_one("span.justify2")
        if authors_tag:  # coverage: 42 ↛ 70 (condition always true)
            authors_string = cleanup_str(authors_tag.get_text())
            if cleanup_str(title_text) == authors_string:  # coverage: 44 ↛ 45 (condition never true)
                authors_tags = parent_tag.select("span.justify2")
                for span in authors_tags:
                    if not span.find("i"):
                        authors_string = span.string
                        break

            if "Abstract" in authors_string:  # coverage: 51 ↛ 52 (condition never true)
                return authors_string.split("Abstract")[0]
            elif title_text in authors_string:  # coverage: 53 ↛ 54 (condition never true)
                res = authors_string.split(title_text)[1]
                return res
            else:
                if authors_string != "":  # coverage: 57 ↛ 59 (condition always true)
                    return authors_string
                authors_tag = parent_tag.select_one("span.justify2 span.justify2 span.justify2")
                if authors_tag:
                    authors_string = authors_tag.get_text()
                    if authors_string != "":
                        return authors_string
                authors_tag = parent_tag.select_one("span.justify2 span.justify2")
                if authors_tag:
                    authors_string = authors_tag.get_text()
                    if authors_string != "":
                        return authors_string
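
        # Last-resort fallbacks: a plain "span.text2" element, or the text node
        # immediately following the first <br> after the title.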
        authors_tag = parent_tag.select_one("span.text2")
        if authors_tag:
            return authors_tag.get_text()
        else:
            br_tag = title_tag.find_next("br")
            if br_tag and br_tag.next_sibling:
                return br_tag.next_sibling.strip()

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.title")
        for issue in issues:
            issue_search = re.search(self.issue_re, issue.text)
            if not issue_search:
                issue_search = re.search(self.issue_re_bis, issue.text)
            if not issue_search:  # coverage: 86 ↛ 87 (condition never true)
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: 90 ↛ 91 (condition never true)
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict.get("number"),
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a.title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: 111 ↛ 112 (condition never true)
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)
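
    # An article page carries the title, the PDF link and a table with the
    # authors, the abstract and the MSC codes; parse_article_content scrapes
    # all of these from that page.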
    def parse_article_content(self, content, xissue, xarticle, url):
        f = open("/tmp/" + xissue.pid, "a")
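        # NOTE: appends a debug trace (article pid and pdf url) to /tmp/<issue pid>;
        # the handle is never explicitly closed.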
        # print("Article Id: ", xarticle.pid)
        f.write(xarticle.pid + "\n")
        soup = BeautifulSoup(content, "html5lib")
        # Title
        title_tags = soup.select("i.title, span.title")
        title_tag = title_tags[0]
        title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
        if title_text == "":
            title_tag = title_tags[1]
            title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
        # print("title: ", cleanup_str(title_text))
        if isinstance(title_text, str):  # coverage: 129 ↛ 132 (condition always true)
            xarticle.title_tex = title_text
        # PDF
        pdf_link_tag = soup.select("a[href$='.pdf']:-soup-contains-own('PDF')")
        if len(pdf_link_tag) != 1:  # coverage: 133 ↛ 134 (condition never true)
            raise ValueError(
                f"Error while trying to parse pdf url: expected exactly one PDF link, found {len(pdf_link_tag)}"
            )
        pdf_link = pdf_link_tag[0].get("href")
        if not isinstance(pdf_link, str):  # coverage: 136 ↛ 137 (condition never true)
            raise ValueError("Couldn't parse article pdf")
        pdf_url = urljoin(self.source_website, pdf_link)
        f.write(pdf_url + "\n")
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        # abstract with authors and msc
        # fetch the span containing "Abstract."
        abstract_span = soup.find("span", string=lambda s: s and "Abstract." in s)
        # fetch the parent table holding the full text
        if abstract_span:  # coverage: 146 ↛ 175 (condition always true)
            parent = abstract_span.find_parent("table")
            authors_string = self.get_authors(parent, title_tag, cleanup_str(title_text)).replace(
                " and ", ", "
            )
            authors = authors_string.split(", ")
            # print("AUTHORS: ", authors)
            for author in authors:
                xarticle.contributors.append(
                    create_contributor(string_name=cleanup_str(author), role="author")
                )
            # ABSTRACT
            full_text = parent.get_text(separator=" ", strip=True).replace("\n", " ")
            abstract_search = re.search(self.abstract_re, full_text)
            if abstract_search:
                abstract = abstract_search.group()
                # MSC
                msc_search = re.search(self.msc_re, full_text)
                msc_string = msc_search.group()
                msc_list = msc_string.split(", ")
                for msc in msc_list:
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))
            else:
                # No MSC codes: extract the abstract without the MSC delimiter
                abstract_search = re.search(self.abstract_no_msc_re, full_text)
                abstract = abstract_search.group()
            # print("ABSTRACT:", abstract)
            xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract))
        else:
            print("Abstract not found.")
            return None

        return xarticle
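
    # A 404 on an article page is logged as a warning and the article is
    # skipped instead of aborting the whole crawl.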
    def crawl_article(self, xarticle, xissue):
        try:
            return super().crawl_article(xarticle, xissue)
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code
            if status_code == 404:
                self.logger.warning(e)
                return None
            raise e
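
    # The bulletin's pages are presumably not served as UTF-8, so decoding is
    # forced to ISO-8859-1 (Latin-1).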
    def decode_response(self, response, encoding="ISO-8859-1"):
        """Force encoding"""
        try:
            return super().decode_response(response, "ISO-8859-1")
        except UnicodeDecodeError:
            self.logger.debug(
                f"Cannot parse resource using {encoding}. Attempting ISO-8859-1",
                extra={"url": response.url},
            )