Coverage for src/crawler/by_source/emis_aaa_crawler.py: 11%

119 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_aaaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")
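    # Illustrative note: issue_re matches headings such as "Volume 12 (2007)" or
    # "Volume 12 (2007), Issue 3" (the issue number is optional); doi_re accepts DOIs
    # given either as "doi:10.…" or as "http://dx.doi.org/10.…" links.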

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages.
    # No choice but to fetch everything in parse_collection_content.
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA volume page and returns an iterable of issues.
        """
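        # Assumption based on the selectors and regex below: each <li> in the article
        # list links to an article page and its text carries the issue heading matched
        # by issue_re (e.g. "Volume 12 (2007), Issue 3").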

        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.select_one("a").get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            # str.replace returns a new string, so the result must be assigned back
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")
            bibitems = []
            for ref_tag in references_tags:
                bibitems.append(self.parse_ref(ref_tag))
            if len(bibitems) > 0:
                xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

        pdf_tag = soup.select_one("a.full_text_pdf")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", []) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
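            # Known targets are mapped to typed ext-links: Zentralblatt MATH search
            # URLs become "zbl-item-id", dx.doi.org URLs become "doi", and any other
            # link is kept as a plain "uri".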

            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml