Coverage for src/crawler/by_source/emis_aaa_crawler.py: 11%
119 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class Emis_aaaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")
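    # For reference (assumed examples inferred from the patterns above, not taken from
    # live EMIS pages): issue_re matches volume headings such as "Volume 12 (2005), Issue 3",
    # where the ", Issue 3" part is optional; doi_re accepts both the "doi:10.xxxx/yyyy" and
    # the "http://dx.doi.org/10.xxxx/yyyy" notations and captures the bare DOI.
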
    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages.
    # No choice but to fetch everything in parse_collection_content.
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA volume page and returns an iterable of issues.
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            link_tag = a_tag.select_one("a")
            if link_tag is None:
                raise ValueError("Couldn't parse article link")
            href = link_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            # Group articles under their issue, creating the issue on first sight
            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        if title_tag is None:
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")
            bibitems = []
            for ref_tag in references_tags:
                bibitems.append(self.parse_ref(ref_tag))
            if len(bibitems) > 0:
                xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

        pdf_tag = soup.select_one("a.full_text_pdf")
        if pdf_tag is None:
            raise ValueError("Couldn't find pdf link")
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
        """Build the citation XML for one reference <li> tag and bake it via JatsBase.bake_ref."""
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", []) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
        """Turn the links inside a 'reflinks' element into JATS ext-link XML (Zbl, DOI or plain URI)."""
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml
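

# Illustrative self-check of the two regexes defined above. The sample strings are
# constructed assumptions that follow the expected heading/DOI formats; they are not
# real EMIS data and this block is only a quick local sanity check.
if __name__ == "__main__":
    sample_heading = "Volume 12 (2005), Issue 3"
    issue_match = Emis_aaaCrawler.issue_re.search(sample_heading)
    assert issue_match is not None
    assert issue_match.group("volume") == "12"
    assert issue_match.group("year") == "2005"
    assert issue_match.group("number") == "3"

    sample_doi = "doi:10.1155/AAA/2006/12345"
    doi_match = Emis_aaaCrawler.doi_re.search(sample_doi)
    assert doi_match is not None
    assert doi_match.group("doi") == "10.1155/AAA/2006/12345"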