Coverage for src/crawler/by_source/bmms_crawler.py: 65%
129 statements

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class BmmsCrawler(BaseCollectionCrawler):
    source_name = "BMMS"
    source_domain = "BMMS"
    source_website = "https://math.usm.my/bulletin/html/research.htm"

    issue_re = r"Volume (?P<volume>\d+)\s*•\s*Number (?P<number>\d+)\s*•\s*(?P<year>\d{4})"
    issue_re_bis = r"V(?P<volume>\d+)\s*•\s*N(?P<number>\d+[A-Z]?)\s*•\s*(?P<year>\d{4})"
    abstract_re = r"(?<=Abstract\.)(.*?)(?=[\d]{4}.*Mathematics Subject Classification)"
    abstract_no_msc_re = r"(?<=Abstract\.)(.*?)(?=Full.*text.*PDF)"
    msc_re = r"(?<=Mathematics Subject Classification:)(.*?)(?=Full)"
    delimiter_disp_formula = {"\\begin{align*}", "$$"}
    delimiter_inline_formula = {"$", "\\(", "\\["}
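
    # Illustrative shapes of the strings these patterns target, reconstructed
    # from the patterns themselves (actual page text may differ):
    #   issue_re:     "Volume 45 • Number 2 • 2022"
    #   issue_re_bis: "V30 • N1A • 2007"
    # abstract_re captures the text between "Abstract." and the MSC heading;
    # abstract_no_msc_re stops at the "Full text PDF" link instead.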
    def get_authors(self, parent_tag, title_tag, title_text):
        """parent_tag is a soup tag containing the authors.
        Returns the authors as a single string.
        """
        # Hardcoded authors for two articles whose markup can't be parsed generically.
        if "Linear Transformations of N-connections in OSC" in cleanup_str(title_text):
            return "Irena Čomić and Monica Purcaru"
        if (
            # "fromTopological" reproduces the title as it appears on the source page.
            "Mapping fromTopological Space to Endomorphisms Algebra of Banach Space and its Applications"
            in cleanup_str(title_text)
        ):
            return "Misir B. Ragimov"
        authors_tag = parent_tag.select_one("span.justify2")
        if authors_tag:
            authors_string = cleanup_str(authors_tag.get_text())
            if cleanup_str(title_text) == authors_string:
                # The first span holds the title; look for a sibling span without
                # italics, which holds the authors.
                authors_tags = parent_tag.select("span.justify2")
                for span in authors_tags:
                    if not span.find("i"):
                        authors_string = span.string
                        break

            if "Abstract" in authors_string:
                return authors_string.split("Abstract")[0]
            elif title_text in authors_string:
                return authors_string.split(title_text)[1]
            elif authors_string != "":
                return authors_string
            authors_tag = parent_tag.select_one("span.justify2 span.justify2 span.justify2")
            if authors_tag:
                authors_string = authors_tag.get_text()
                if authors_string != "":
                    return authors_string
            authors_tag = parent_tag.select_one("span.justify2 span.justify2")
            if authors_tag:
                authors_string = authors_tag.get_text()
                if authors_string != "":
                    return authors_string

        authors_tag = parent_tag.select_one("span.text2")
        if authors_tag:
            return authors_tag.get_text()
        br_tag = title_tag.find_next("br")
        if br_tag and br_tag.next_sibling:
            return br_tag.next_sibling.strip()
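
    # Minimal sketch of the markup shape get_authors handles on its simplest
    # path (hypothetical fragment; real pages nest several span.justify2 levels):
    #
    #   html = '<table><tr><td><span class="justify2">A. Author and B. Author</span></td></tr></table>'
    #   parent = BeautifulSoup(html, "html.parser")
    #   crawler.get_authors(parent, title_tag, "Some Title")  # title_tag unused on this path
    #   # -> "A. Author and B. Author"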
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.title")
        for issue in issues:
            issue_search = re.search(self.issue_re, issue.text)
            if not issue_search:
                issue_search = re.search(self.issue_re_bis, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict.get("number"),
                )
            )

        return xissues
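
    # For example (values hypothetical), an <a class="title"> whose text is
    # "Volume 45 • Number 2 • 2022" yields year="2022", volume="45", number="2",
    # and its href, resolved against source_website, becomes the issue URL.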
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a.title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)
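
    # Articles get positional pids "a0", "a1", ... in document order; their
    # metadata is filled in later by parse_article_content.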
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")
        # Title
        title_tags = soup.select("i.title, span.title")
        title_tag = title_tags[0]
        title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
        if title_text == "":
            # Some pages leave the first title tag empty; fall back to the second.
            title_tag = title_tags[1]
            title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
        xarticle.title_tex = title_text

        # PDF
        pdf_link_tags = soup.select("a[href$='.pdf']:-soup-contains-own('PDF')")
        if len(pdf_link_tags) != 1:
            raise ValueError(
                f"Error while trying to parse pdf url: expected exactly one candidate, found {len(pdf_link_tags)}"
            )
        pdf_link = pdf_link_tags[0].get("href")
        if not isinstance(pdf_link, str):
            raise ValueError("Couldn't parse article pdf")
        pdf_url = urljoin(self.source_website, pdf_link)
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        # Authors, abstract and MSC all live in the same table:
        # find the span containing "Abstract." and work from its parent table.
        abstract_span = soup.find("span", string=lambda s: s and "Abstract." in s)
        if abstract_span:
            parent = abstract_span.find_parent("table")
            authors_string = self.get_authors(parent, title_tag, cleanup_str(title_text))
            if authors_string:
                for author in authors_string.replace(" and ", ", ").split(", "):
                    xarticle.contributors.append(
                        create_contributor(string_name=cleanup_str(author), role="author")
                    )
            # Abstract
            full_text = parent.get_text(separator=" ", strip=True).replace("\n", " ")
            abstract_search = re.search(self.abstract_re, full_text)
            if abstract_search:
                abstract = abstract_search.group()
                # MSC
                msc_search = re.search(self.msc_re, full_text)
                if msc_search:
                    for msc in msc_search.group().split(", "):
                        xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))
            else:
                # No MSC: extract the abstract without the MSC delimiter.
                abstract_search = re.search(self.abstract_no_msc_re, full_text)
                abstract = abstract_search.group() if abstract_search else None
            if abstract:
                xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract))
        else:
            self.logger.warning("Abstract not found", extra={"url": url})
            return None

        return xarticle

    def decode_response(self, response, encoding="ISO-8859-1"):
        """Force ISO-8859-1 as the default encoding."""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            self.logger.debug(
                f"Cannot parse resource using {encoding}. Attempting ISO-8859-1",
                extra={"url": response.url},
            )
            return super().decode_response(response, "ISO-8859-1")
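
# Minimal usage sketch (hypothetical: instantiation and the crawl entry point
# are provided by BaseCollectionCrawler and are not part of this file):
#
#   crawler = BmmsCrawler(...)                        # constructor args not shown here
#   xissues = crawler.parse_collection_content(html)  # html fetched from source_website
#   for xissue in xissues:
#       crawler.parse_issue_content(issue_html, xissue)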