Coverage for src/crawler/by_source/seio_crawler.py: 76%

113 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# STAT : currently only supports BEIO.
# Issue and volume number parsing needs to be adapted for other collections
class SeioCrawler(BaseCollectionCrawler):
    source_name = "Sociedad de Estadistica e Investigación Operativa"
    source_domain = "SEIO"
    source_website = "https://www.seio.es/"

    issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
    volume_re = r"Volumen? (?P<volume>\d+)"
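    # Illustrative examples (assumed, not copied from the site) of strings these
    # patterns are meant to match: an issue link such as "Número 1 / Marzo 2023"
    # and a toggle title such as "Volumen 38". Other collections may label their
    # issues differently, hence the note above.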


    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        issues_tags = soup.select(".et_pb_toggle a")
        for i in issues_tags:
            issue_search = regex.search(self.issue_re, i.text)
            if not issue_search:
                print(f"Couldn't parse issue data from string : {i.text}")
                continue
            issue_data = issue_search.groupdict()

            parent_tag = i.find_parent(class_="et_pb_toggle")
            volume_tag = parent_tag.select_one(".et_pb_toggle_title")

            if not volume_tag:  # coverage: condition never true
                raise ValueError("Couldn't find volume")
            volume_search = regex.search(self.volume_re, volume_tag.text)
            if not volume_search:  # coverage: condition never true
                raise ValueError("Couldn't parse volume")
            volume_dict = volume_search.groupdict()
            if volume_dict["volume"] is None:  # coverage: condition never true
                raise ValueError("Couldn't parse volume text")

            url = i.get("href")
            if not isinstance(url, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue href")

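            # A multi-part issue number such as "1, 2" (illustrative example) is
            # normalised to "1-2" before the issue is created.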
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, url),
                    issue_data["year"],
                    volume_dict["volume"],
                    issue_data["number"].replace(", ", "-"),
                )
            )
        return xissues

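    # crawl_issue: some issue links point directly at a PDF rather than at a
    # table-of-contents page; in that case the PDF is attached to the issue itself
    # and the URL is cleared before delegating to the base crawler.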
    def crawl_issue(self, xissue):
        if not xissue.url:
            raise ValueError("Error while parsing issue : issue url must be set")
        if xissue.url.endswith(".pdf"):
            add_pdf_link_to_xarticle(xissue, xissue.url)
            xissue.url = None
        return super().crawl_issue(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("h2.entry-title a")

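        # Each entry in the table of contents becomes an article stub with a
        # sequential pid ("a0", "a1", ...) and the URL of its landing page.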
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            url = article_tag.get("href")
            if not isinstance(url, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article href")
            xarticle.url = url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.select_one("div#main-content h1.br--text")
        if title_tag is None:  # coverage: condition never true
            raise ValueError("Couldn't parse title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        keywords_tag = soup.select_one("div#main-content .keywords")
        if keywords_tag:
            keyword_header = keywords_tag.select_one("strong")
            if keyword_header:  # coverage: condition always true
                keyword_header.decompose()

            kwd_text = cleanup_str(keywords_tag.text)

            for kwd_str in kwd_text.split(", "):
                xarticle.kwds.append(create_subj(value=kwd_str))
            keywords_tag.decompose()

        # WARN : some abstracts have multiple paragraphs and/or lists
        # https://www.seio.es/beio/grupo-de-tratamiento-y-analisis-de-big-data-tabida/
        abstract_tag = soup.select_one("div#main-content .abstract-section")
        if abstract_tag:
            abstract_header = abstract_tag.select_one("h3")
            if abstract_header:  # coverage: condition always true
                abstract_header.decompose()
            abstract_str = abstract_tag.text
            xarticle.lang = self.detect_language(abstract_str)
            xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_str))

        # Unfortunately, SEIO authors are formatted differently depending on the issue.
        # We try to recover as many as we can, but it is sometimes not straightforward;
        # the remaining cases are currently ignored, so no author will appear on Geodesic.
        author_tags = soup.select("div#main-content .author-info .author > p")
        if not author_tags or len(author_tags) == 0:
            author_tags = soup.select("div#main-content p.author")

        for author_tag in author_tags:
            contributor = create_contributor(role="author")

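            # Depending on the issue layout, the author name may be wrapped in a
            # <strong> tag, in a <code> inside the first <span>, or in a leading
            # <code> tag; the selectors below are tried in that order.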
            author_name = (
                author_tag.select_one("strong")
                or author_tag.select_one("span:first-child > code")
                or author_tag.select_one("code:first-child")
            )
            if not author_name:  # coverage: condition never true
                print(f"Couldn't find author name: {url}")
                continue

            contributor["string_name"] = author_name.text
            author_name.decompose()

            orcid_tag = author_tag.select_one("a[href^='https://orcid.org']")
            if orcid_tag:  # coverage: condition always true
                orcid_url = orcid_tag.get("href")
                if not isinstance(orcid_url, str):  # coverage: condition never true
                    raise ValueError("Found orcid tag, but couldn't parse href")

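                # orcid_re is not defined in this class; presumably it is inherited
                # from BaseCollectionCrawler (assumption based on the self.orcid_re
                # lookup below).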
                orcid_search = regex.search(self.orcid_re, orcid_url)
                if not orcid_search:  # coverage: condition never true
                    raise ValueError("Found orcid url, but couldn't parse it.")
                contributor["orcid"] = orcid_search.groupdict()["orcid"]
                orcid_tag.decompose()

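            # Once the name and ORCID tags have been removed, the first remaining
            # non-empty text node of the paragraph is treated as the affiliation.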
            affiliation = next(
                (e.text for e in author_tag if str(e.text) not in ("<br/>", "\n", ".")), None
            )

            if isinstance(affiliation, str):  # coverage: condition always true
                affiliation = cleanup_str(affiliation)
                if len(affiliation) > 0:  # coverage: condition never true
                    contributor["addresses"].append(cleanup_str(affiliation))

            xarticle.contributors.append(contributor)

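        # ":-soup-contains()" is a soupsieve (BeautifulSoup) pseudo-class that matches
        # elements whose text contains the given substring, here the "PDF" download button.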
        pdf_button = soup.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")
        if not pdf_button:  # coverage: condition never true
            raise ValueError("Couldn't find PDF link")
        pdf_url = pdf_button.get("href")
        if not isinstance(pdf_url, str):  # coverage: condition never true
            raise ValueError("Couldn't parse pdf url")
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return super().parse_article_content(content, xissue, xarticle, url, pid)