Coverage for src/crawler/by_source/dmlbul_crawler.py: 94%

80 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import json

import regex  # third-party "regex" package, not the stdlib "re"
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as article (2 pdfs)
# Handle this by creating another article as an errata http://www.numdam.org/item/SEDP_1971-1972____A1_0/
class DmlbulCrawler(BaseCollectionCrawler):
    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    # Regexes to detect issue metadata, from most specific to least
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]

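    # Illustrative citation strings of the kind these patterns are meant to
    # match (the titles and numbers below are hypothetical, not taken from
    # the corpus):
    #   "Mathematica Balkanica, Vol. 9, No 2-3, (1995), 120p-135p"  (MEM pattern)
    #   "Volume 35, Number 4, 2009, pp. 301-312"                    (SMJ2 pattern)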

    def parse_collection_content(self, content):
        """
        Parse the collection's JSON description from the BulDML API and
        return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        data = json.loads(content)
        collection_oid = data["_id"]
        articles = self.query_article_list(collection_oid)
        issues = self.extract_issue_list(articles)
        return issues

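    # The collection payload is assumed to be JSON carrying at least
    # {"_id": "<collection oid>"}; only "_id" is read here before delegating
    # to query_article_list.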

    def query_article_list(self, collection_oid: str):
        articles_data = []
        count = float("inf")
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": collection_oid, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            articles_data.extend(data["data"])
        return articles_data

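    # Response-shape note: judging from the keys read above, the search
    # endpoint is assumed to answer with JSON of the form
    #   {"count": <total matches>, "data": [{"identifier": ..., "_alias": ...}, ...]}
    # and the loop pages through it in batches of at most 200 records until
    # all `count` entries have been collected.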

    def extract_issue_list(self, article_list: list[dict]):
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            issue_re = None
            re_index = 0
            MAX_RETRIES = len(self.issue_regexes)
            while issue_re is None and re_index < MAX_RETRIES:
                issue_re = regex.search(self.issue_regexes[re_index], issue_str)
                re_index += 1

            if not issue_re:  # coverage: condition never true in the test run
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed"
                )
            issue_dict = issue_re.groupdict()

            if issue_dict.get("pagestart", None):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend", None):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number", None),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
                )
                issues[pid] = issue

            article.pid = "a" + str(len(issues[pid].articles))
            issue.articles.append(article)
        return list(issues.values())

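    # Judging by the TODO above, issue pids look like "FCAA_2006_9_2"
    # (collection_year_volume_number), and article pids are "a0", "a1", ...
    # in append order, so "FCAA_2006_9_2_a5" is the sixth article of that issue.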

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
    ):
        data = json.loads(content)
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(
                    lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
                )
            )
        # keywords
        if "subject" in data:  # coverage: condition always true in the test run
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:  # coverage: condition never true in the test run
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have a url"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            print(
                f"[{self.source_domain}] {xarticle.pid} : Article URL not found, falling back to handle.net. Please check link validity: {url}"
            )
        xarticle.url = url
        if len(data["media"]) == 0:  # coverage: condition never true in the test run
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle
160 return xarticle