Coverage for src/crawler/by_source/dmlbul_crawler.py: 13%

82 statements  

coverage.py v7.12.0, created at 2025-12-23 15:27 +0000

import json

import regex
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as article (2 pdfs)
# Handle this by creating another article as an erratum, cf. https://www.numdam.org/item/SEDP_1971-1972____A1_0/
class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"
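    # All requests below target this host: the JSON search API under
    # /api/object/ and the PDF asset store under /asset/default/.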

    # Regexes to detect issue metadata, from most specific to least
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]
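    # Illustrative (made-up) citation strings each pattern is meant to match:
    #   "Mathematica Balkanica, Vol. 9, No 1, (1995), 12p-34p"  -> MEM pattern
    #   "(2007). Title, 33(2), 123-145."                        -> second pattern
    #   "Volume 12, Number 3-4, 2010, pp. 5-17"                 -> SMJ2 pattern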

    def parse_collection_content(self, content):
        """
        Parse the collection's JSON description and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        data = json.loads(content)
        id = data["_id"]
        articles = self.query_article_list(id)
        issues = self.extract_issue_list(articles)
        return issues
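
    # query_article_list pages through the search API: repeated POSTs fetch up
    # to 200 records each until the server-reported "count" has been collected.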

    def query_article_list(self, id: str):
        articles_data = []
        count = float("inf")
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": id, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data
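
    # extract_issue_list groups articles into issues keyed by the PID derived
    # from (collection, year, volume, number); within an issue, article PIDs
    # are assigned as "a0", "a1", ... in encounter order.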

    def extract_issue_list(self, article_list: list[dict]):
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # Try each issue regex in turn until one matches the citation string
            issue_re = None
            re_index = 0
            MAX_RETRIES = len(self.issue_regexes)
            while issue_re is None and re_index < MAX_RETRIES:
                issue_re = regex.search(self.issue_regexes[re_index], issue_str)
                re_index += 1

            if not issue_re:
                self.logger.error(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed",
                    extra={"input": _a},
                )
                continue
            issue_dict = issue_re.groupdict()

            if issue_dict.get("pagestart"):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend"):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number"),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number")
                )
                issues[pid] = issue

            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())
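
    # parse_article_content fills in title, language, authors, abstract and
    # keywords from the article's JSON record, then resolves the article URL
    # and its PDF asset link(s).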

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
    ):
        data = json.loads(content)
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(lang=xarticle.lang, value_tex=data["abstract"]["_v"])
            )
        # keywords
        if "subject" in data:
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            self.logger.info(
                f"Article URL not found, falling back to handle.net. Please check link validity: {url}",
                extra={"pid": xarticle.pid},
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle
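
# Minimal usage sketch (hypothetical: constructor arguments and the crawl entry
# point depend on BaseCollectionCrawler, which is not shown in this file):
#
#   crawler = DmlbulCrawler(collection_id="MEM", collection_url="...")
#   xissues = crawler.parse_collection_content(collection_json)
#   for xissue in xissues:
#       for xarticle in xissue.articles:
#           ...  # fetch xarticle.url, then call parse_article_content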