Coverage for src/crawler/by_source/emis_hoa_crawler.py: 12%
120 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_hoaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_HOA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")
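    # Illustrative examples (not taken from the crawled pages): issue_re is meant to match
    # headings like "Volume 12 (2004)" or "Volume 12 (2004), Issue 3", and doi_re extracts
    # the DOI from strings such as "doi:10.1234/abcd.5678" or
    # "http://dx.doi.org/10.1234/abcd.5678".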

    verify = False

    def parse_collection_content(self, content):
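        """
        Parse the collection index page: follow each link whose href starts with
        "Volume" and gather the issues found on every volume page.
        """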
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages
    # No choice but to fetch everything in parse_collection_content
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA Volume page and returns an iterable of issues
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.select_one("a").get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

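            # Articles that share the same "Volume ... (year)[, Issue ...]" heading map to
            # the same issue pid, so they are grouped under a single xissue below.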
            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
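        """
        Parse a single article page: extract the title, authors, DOI, abstract,
        references and full-text PDF link and store them on xarticle.
        """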
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            # Normalize "A, B, and C" to "A, B, C" so the string splits cleanly on ", "
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )
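
        # Note: compiled references are appended to xarticle.abstracts (the same list
        # that holds the abstract), via JatsBase.compile_refs below.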
        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")
            bibitems = []
            for ref_tag in references_tags:
                bibitems.append(self.parse_ref(ref_tag))
            if len(bibitems) > 0:
                xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

        pdf_tag = soup.select_one("a.full_text_pdf")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
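        """
        Turn one reference <li> into a JATS <ref>: <i> children are serialised as
        article titles, "reflinks" children become external links, and plain text
        nodes are kept verbatim.
        """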
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", []) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
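        """
        Convert the links of a "reflinks" block into JATS ext-links: Zentralblatt
        URLs become "zbl-item-id" links, dx.doi.org URLs become "doi" links, and
        anything else becomes a generic "uri" link.
        """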
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml