Coverage for src/crawler/by_source/emis_hoa_crawler.py: 13%
120 statements
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.references import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_hoaCrawler(BaseCollectionCrawler):
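    """
    Crawler for collections mirrored on EMIS (www.emis.de), registered as the
    EMIS_HOA source.

    Only the parsing logic lives here; the crawl itself is assumed to be driven by
    BaseCollectionCrawler, which calls parse_collection_content() to build the issue
    list and parse_article_content() to fill in each article.
    """
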
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_HOA"
    source_website = "https://www.emis.de"
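
    # Issue headings on the volume pages are expected to look like
    # "Volume 2014 (2014), Issue 1" (the issue number is optional); DOIs appear either
    # as "doi:10.xxxx/..." text or as "http://dx.doi.org/10.xxxx/..." links.
    # The example strings are illustrative, inferred from the regexes below.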
    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")
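
    # Presumably forwarded to the HTTP client to disable TLS certificate verification
    # (an assumption based on the attribute name; the base class defines its exact meaning).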
    verify = False

    @classmethod
    def get_view_id(cls):
        return "EMIS"

    def parse_collection_content(self, content):
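        """
        Parses the collection page: follows each volume link (anchors whose href starts
        with "Volume") and returns the issues collected from those pages.
        """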
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages.
    # No choice but to fetch everything in parse_collection_content.
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA Volume page and returns an iterable of issues.
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            article_a = a_tag.select_one("a")
            if article_a is None:
                raise ValueError("Couldn't find article link")
            href = article_a.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)
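
            # The list item text carries the issue heading (e.g. "Volume 2014 (2014),
            # Issue 1"; example illustrative), which identifies the issue this article
            # belongs to.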
            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )
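
            # Issues are created lazily and keyed by pid, so articles from the same
            # volume/issue end up grouped under a single IssueData.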
            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
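        """
        Parses an article's abstract page and fills in xarticle: title, authors, DOI,
        abstract, bibliography and PDF link.
        """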
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            # str.replace returns a new string, so the result must be reassigned
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))
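
        # The DOI appears to be published in a <pre> block, either as "doi:..." text or
        # as a dx.doi.org URL (see doi_re).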
        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")

            for ref_tag in references_tags:
                xarticle.bibitems.append(self.parse_ref(ref_tag))

        pdf_tag = soup.select_one("a.full_text_pdf")
        if pdf_tag is None:
            raise ValueError("Couldn't find pdf link")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
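        """
        Converts one <li> reference into JATS XML: italicized text is treated as the
        article title, "reflinks" elements are turned into external links, and plain
        text is kept as-is.
        """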
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", None) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
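        """
        Converts the links inside a "reflinks" element into JATS ext-link XML:
        Zentralblatt links become "zbl-item-id" links, dx.doi.org links become "doi"
        links, and anything else becomes a plain "uri" link.
        """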
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml