Coverage for src/crawler/by_source/dmlbul_crawler.py: 94%

83 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import json

import regex

from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

15 

16 

# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as article (2 pdfs)
# Handle this by creating another article as an errata http://www.numdam.org/item/SEDP_1971-1972____A1_0/
class DmlbulCrawler(BaseCollectionCrawler):
    """Crawler for the Bulgarian Digital Mathematics Library (buldml.math.bas.bg).

    The site exposes a JSON API: the article list of a collection is paged
    through ``POST /api/object/search`` and each article is fetched from
    ``/api/object/a/<alias>``. Issue metadata (volume/number/year/pages) is
    not provided as structured data and has to be parsed out of each
    article's citation string (see ``issue_regexes``).
    """

    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    periode_begin = 0
    periode_end = 9999

    # Regexes to detect issue metadata in a citation string,
    # ordered from most specific to least specific.
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]

    def parse_collection_content(self, content):
        """Parse the JSON description of a DMLBUL collection and return its issues.

        ``content`` is the JSON body of the collection object; its ``_id`` is
        used to page through the collection's articles, which are then grouped
        into xissues. Each xissue has its pid/volume/number/year metadata + its url.
        """
        data = json.loads(content)
        collection_oid = data["_id"]
        articles = self.query_article_list(collection_oid)
        return self.extract_issue_list(articles)

    def query_article_list(self, id: str):
        """Fetch the complete article list of a collection, 200 entries per request.

        ``id`` is the collection's internal object id. (The parameter name
        shadows the builtin but is kept for interface stability.)
        Returns the raw list of article dicts from the search API.
        """
        articles_data: list[dict] = []
        count = float("inf")  # unknown until the first response arrives
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": id, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            if not data["data"]:
                # Defensive: if the server reports more articles than it
                # actually returns, stop paging instead of looping forever.
                break
            articles_data.extend(data["data"])
        return articles_data

    def extract_issue_list(self, article_list: list[dict]):
        """Group raw article records into xissues keyed by their issue pid.

        Issue metadata is parsed from each article's citation string with the
        first matching pattern of ``issue_regexes``.

        Raises ValueError when no pattern matches a citation.
        """
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # First pattern (most specific first) that matches the citation wins.
            issue_re = next(
                (
                    match
                    for pattern in self.issue_regexes
                    if (match := regex.search(pattern, issue_str)) is not None
                ),
                None,
            )
            if not issue_re:
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed"
                )
            issue_dict = issue_re.groupdict()

            # Page range groups are optional in several patterns.
            if issue_dict.get("pagestart"):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend"):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number"),
            )
            issue = issues.get(pid)
            if issue is None:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number")
                )
                issues[pid] = issue

            # Article pids are sequential within their issue.
            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        """Fill ``xarticle`` from the JSON description of one article.

        Sets title, language, authors, abstract, keywords, canonical url and
        PDF links.

        Raises ValueError when the article has no url or no PDF media.
        """
        data = json.loads(content)
        xarticle.pid = pid
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"  # ISO 639-2 code for "undetermined"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(
                    lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
                )
            )
        # keywords
        if "subject" in data:
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have an url"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            # No canonical URI: build a handle.net link from the article alias.
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            print(
                f"[{self.source_domain}] {xarticle.pid} : Article URL not found, falling back with handle.net. Please check link validity : {url}"
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF Not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle