Coverage for src/crawler/by_source/emis_hoa_crawler.py: 13%

120 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.references import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_hoaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_HOA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")

    verify = False

    @classmethod
    def get_view_id(cls):
        return "EMIS"

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages.
    # No choice but to fetch everything in parse_collection_content.
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA Volume page and returns an iterable of issues
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.select_one("a").get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            # str.replace returns a new string: assign the result, otherwise ", and " is never stripped
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")

            for ref_tag in references_tags:
                xarticle.bibitems.append(self.parse_ref(ref_tag))

        pdf_tag = soup.select_one("a.full_text_pdf")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", None) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml
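
For reference, a minimal sketch (not part of the module above, assuming only the third-party regex package it imports) of how issue_re and doi_re resolve typical listing strings; the sample volume string and DOI are hypothetical:

import regex

issue_re = regex.compile(
    r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
)
doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")

# Hypothetical listing entry; real pages follow the same "Volume N (YYYY), Issue M" shape.
issue_match = issue_re.search("Volume 2007 (2007), Issue 2")
if issue_match:
    print(issue_match.groupdict())  # {'volume': '2007', 'year': '2007', 'number': '2'}

# Hypothetical DOI string; matches either the "doi:" prefix or a dx.doi.org URL.
doi_match = doi_re.search("doi:10.1155/2007/12345")
if doi_match:
    print(doi_match.group("doi"))  # 10.1155/2007/12345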