Coverage for src/crawler/by_source/emis_hoa_crawler.py: 14% (121 statements)
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.references import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class Emis_hoaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_HOA"
    source_website = "https://www.emis.de"
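
    # Example labels these patterns are written against (illustrative strings derived
    # from the regexes themselves, not from captured pages):
    #   issue_re: "Volume 2014 (2014), Issue 2" or "Volume 7 (2002)"
    #   doi_re:   "doi:10.1155/2014/839467" or "http://dx.doi.org/10.1155/2014/839467"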
    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")
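
    # Presumably disables TLS certificate verification for downloads; the flag is
    # consumed by BaseCollectionCrawler (assumption, the base class is not shown here).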
    verify = False

    @classmethod
    def get_view_id(cls):
        return "EMIS"

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages.
    # No choice but to fetch everything in parse_collection_content.
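    # Each volume page lists every article together with its "Volume X (YYYY), Issue N"
    # label, so issues are rebuilt here by grouping articles on that label.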
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA Volume page and returns an iterable of issues
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.select_one("a").get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE : parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")

            for ref_tag in references_tags:
                xarticle.bibitems.append(self.parse_ref(ref_tag))

        pdf_tag = soup.select_one("a.full_text_pdf")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
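        # Italic tags are treated as the article title, "reflinks" spans are turned
        # into external links, and plain text nodes are kept verbatim before the
        # whole string is baked into a JATS reference.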
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", None) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
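        # Classifies each link in the "reflinks" block: zbMATH query links become
        # "zbl-item-id" ext-links, dx.doi.org links become "doi" ext-links, and
        # anything else is emitted as a generic "uri" ext-link.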
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml