Coverage for src/crawler/by_source/lofpl_crawler.py: 85%

79 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

  1  import json
  2
  3  import regex
  4  from langcodes import standardize_tag
  5  from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
  6  from ptf.model_data import (
  7      ArticleData,
  8      IssueData,
  9      create_abstract,
 10      create_articledata,
 11      create_contributor,
 12  )
 13
 14  from crawler.base_crawler import BaseCollectionCrawler
 15  from crawler.utils import add_pdf_link_to_xarticle
 16
 17

 18  class LofplCrawler(BaseCollectionCrawler):
 19      source_name = "Library of Science"
 20      source_domain = "LOFPL"
 21      source_website = "https://bibliotekanauki.pl"
 22
 23      periode_begin = 0
 24      periode_end = 9999
 25
 26      doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
 27

 28      def parse_collection_content(self, content):
 29          """
 30          Parse the JSON list of issues from the Library of Science API and return a list of xissue.
 31          Each xissue has its pid/volume/number/year metadata + its url.
 32
 33          self.periode is set at the end based on the years of these xissues.
 34          """
 35          issues = []
 36          data = json.loads(content)
 37          for entry in data:
 38              link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
 39              year = entry["year"]
 40              volume = entry["volume"]
 41              number = entry["number"]
 42              issue = self.create_xissue(link, year, volume, number)
 43              issues.append(issue)
 44          return issues
 45
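For context, here is a minimal sketch of the collection payload this parser expects. The field names (id, year, volume, number) and the URL pattern come from the code above; the values are invented for illustration.

    import json

    # Illustrative only: field names match what parse_collection_content reads; ids and values are invented.
    sample_collection_json = """
    [
      {"id": 2201000, "year": "2023", "volume": "44", "number": "1"},
      {"id": 2201050, "year": "2023", "volume": "44", "number": "2"}
    ]
    """

    for entry in json.loads(sample_collection_json):
        link = "https://bibliotekanauki.pl/api/issues/" + str(entry["id"]) + "/articles"
        print(link, entry["year"], entry["volume"], entry["number"])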

 46      def parse_issue_content(self, content: str, xissue: IssueData):
 47          data = json.loads(content)
 48          for index, entry in enumerate(data):
 49              xarticle = create_articledata()
 50              xarticle.pid = "a" + str(index)
 51              xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
 52              xissue.articles.append(xarticle)
 53
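Similarly, a sketch of the issue payload parse_issue_content walks: only the "id" field is read, and the pids a0, a1, ... and the /api/articles/<id> URLs follow the code above. The ids below are invented.

    import json

    # Illustrative only: the issue endpoint is assumed to return a JSON array of article objects.
    sample_issue_json = '[{"id": 2201019}, {"id": 2201020}]'

    for index, entry in enumerate(json.loads(sample_issue_json)):
        pid = "a" + str(index)                                   # a0, a1, ...
        url = "https://bibliotekanauki.pl/api/articles/" + str(entry["id"])
        print(pid, url)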

 54      def parse_article_content(
 55          self,
 56          content: str,
 57          xissue: IssueData,
 58          xarticle: ArticleData,
 59          url: str,
 60          pid: str,
 61      ):
 62          xarticle.pid = pid
 63
 64          data = json.loads(content)
 65
 66          # Titles
 67          xarticle.title_tex = data["mainTitle"]["text"]
 68          xarticle.lang = standardize_tag(data["language"])
 69
 70          if data["mainTitleTranslations"]:
 71              # PID known to have translations : PMF
 72              print(f"[{pid}]title translations found")
 73              # raise NotImplementedError("title translations found")
 74
 75          # Abstracts
 76          if data["mainAbstract"]["text"] != "":          76 ↛ 85: condition was always true
 77              xabstract = create_abstract(
 78                  tag="abstract",
 79                  value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
 80                  lang=standardize_tag(data["mainAbstract"]["language"]),
 81              )
 82
 83              xarticle.abstracts.append(xabstract)
 84
 85          if data["abstractTranslations"]:                85 ↛ 87: condition was never true
 86              # PID known to have translations : PMF
 87              print(f"[{pid}]abstract translations found")
 88
 89          if data["pageRange"]:
 90              self.set_pages(xarticle, data["pageRange"])
 91
 92          # Keywords
 93          for keyword in data["keywords"]:
 94              xarticle.kwds.append(
 95                  {
 96                      "type": "",
 97                      "lang": standardize_tag(keyword["language"]),
 98                      "value": keyword["text"],
 99                  }
100              )
101
102          # Authors
103          for a in data["contributors"]:
104              if a["role"] != "AUTHOR":                   104 ↛ 105: condition was never true
105                  raise NotImplementedError("Author role not implemented")
106              author = create_contributor()
107              author["role"] = "author"
108              # Workaround for https://bibliotekanauki.pl/api/articles/2201019
109              if a["firstName"] and a["lastName"]:        109 ↛ 113: condition was always true
110                  author["first_name"] = a["firstName"]
111                  author["last_name"] = a["lastName"]
112              else:
113                  author["string_name"] = a["lastName"] or a["firstName"]
114
115              author["corresponding"] = a["corresponding"]
116              author["orcid"] = a["orcid"]
117              xarticle.contributors.append(author)
118
119          if len(data["bibEntries"]) > 0:
120              xarticle.abstracts.append(
121                  self.create_bibliography(
122                      [self.parse_bibitem(bib_entry) for bib_entry in data["bibEntries"]]
123                  )
124              )
125
126          add_pdf_link_to_xarticle(
127              xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
128          )
129          return xarticle
130
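For reference, a skeleton of the article payload this method consumes, reconstructed from the keys it reads. Every value below is invented, and the real API response may carry more fields.

    # Reconstructed from the keys read by parse_article_content; values are invented.
    sample_article = {
        "id": 2201019,
        "language": "en",
        "mainTitle": {"text": "A sample title"},
        "mainTitleTranslations": [],
        "mainAbstract": {"text": "A sample abstract.", "language": "en"},
        "abstractTranslations": [],
        "pageRange": "1-10",
        "keywords": [{"language": "en", "text": "sample keyword"}],
        "contributors": [
            {
                "role": "AUTHOR",
                "firstName": "Jane",
                "lastName": "Doe",
                "corresponding": True,
                "orcid": None,
            }
        ],
        "bibEntries": [],
    }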

131      def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
132          # Fix: set correct article url
133          article = super().crawl_article(xarticle, xissue)
134          ext_link = next(link for link in article.ext_links if link["rel"] == "source")
135          ext_link["location"] = ext_link["location"].replace("/api", "")
136          return article
137
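The override above exists only to rewrite the "source" ext_link from the API endpoint to the public article page. A one-line illustration of the replace, using an id that appears in the code comments above:

    # Illustrative only: shows the effect of the replace("/api", "") fix above.
    api_location = "https://bibliotekanauki.pl/api/articles/2201019"
    public_location = api_location.replace("/api", "")
    assert public_location == "https://bibliotekanauki.pl/articles/2201019"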

138      def parse_bibitem(self, text: str):
139          doi_re = list(regex.finditer(self.doi_regex, text))
140          if len(doi_re) == 0:                            140 ↛ 142: condition was always true
141              return self.create_crawled_bibitem(text)
142          text = regex.sub(self.doi_regex, "", text)
143          text = text.removesuffix(", ")
144          for doi_entry in doi_re:
145              href = doi_entry.group(1)
146              text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")
147
148          return self.create_crawled_bibitem(text)
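Finally, a small standalone sketch of how doi_regex and the string handling in parse_bibitem behave on a reference ending in a DOI. The reference text is invented, and create_crawled_bibitem / get_ext_link_xml are not exercised here.

    import regex

    doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"

    # Invented reference string for illustration.
    bib_text = "A. Author, Some paper, J. Math. 1 (2020), 1-10, DOI: https://doi.org/10.1000/xyz123"

    matches = list(regex.finditer(doi_regex, bib_text))
    if matches:
        stripped = regex.sub(doi_regex, "", bib_text).removesuffix(", ")
        doi_url = matches[0].group("doi_url")                 # same group as .group(1) above
        doi_id = doi_url.removeprefix("https://doi.org/")     # "10.1000/xyz123"
        print(stripped, doi_url, doi_id)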