Coverage for src/crawler/by_source/seio_crawler.py: 77% (116 statements)
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

from urllib.parse import urljoin

import lingua
import regex
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# STAT: currently only supports BEIO.
# Issue and volume number parsing needs to be adapted for other collections.
class SeioCrawler(BaseCollectionCrawler):
    source_name = "Sociedad de Estadistica e Investigación Operativa"
    source_domain = "SEIO"
    source_website = "https://www.seio.es/"

    # (Hypothetical match examples for these two patterns are sketched after the class.)
    issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
    volume_re = r"Volumen? (?P<volume>\d+)"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.SPANISH, lingua.Language.FRENCH
    ).build()

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        issues_tags = soup.select(".et_pb_toggle a")
        for i in issues_tags:
            issue_search = regex.search(self.issue_re, i.text)
            if not issue_search:
                print(f"Couldn't parse issue data from string: {i.text}")
                continue
            issue_data = issue_search.groupdict()

            parent_tag = i.find_parent(class_="et_pb_toggle")
            volume_tag = parent_tag.select_one(".et_pb_toggle_title")

            if not volume_tag:  # coverage: branch never taken in tests
                raise ValueError("Couldn't find volume")
            volume_search = regex.search(self.volume_re, volume_tag.text)
            if not volume_search:  # coverage: branch never taken in tests
                raise ValueError("Couldn't parse volume")
            volume_dict = volume_search.groupdict()
            if volume_dict["volume"] is None:  # coverage: branch never taken in tests
                raise ValueError("Couldn't parse volume text")

            url = i.get("href")
            if not isinstance(url, str):  # coverage: branch never taken in tests
                raise ValueError("Couldn't parse issue href")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, url),
                    issue_data["year"],
                    volume_dict["volume"],
                    issue_data["number"].replace(", ", "-"),
                )
            )
        return xissues

    def crawl_issue(self, xissue):
        if not xissue.url:
            raise ValueError("Error while parsing issue: issue url must be set")
        if xissue.url.endswith(".pdf"):
            add_pdf_link_to_xarticle(xissue, xissue.url)
            xissue.url = None
        return super().crawl_issue(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("h2.entry-title a")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            url = article_tag.get("href")
            if not isinstance(url, str):  # coverage: branch never taken in tests
                raise ValueError("Couldn't parse article href")
            xarticle.url = url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.select_one("div#main-content h1.br--text")
        if title_tag is None:  # coverage: branch never taken in tests
            raise ValueError("Couldn't parse title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        keywords_tag = soup.select_one("div#main-content .keywords")
        if keywords_tag:
            keyword_header = keywords_tag.select_one("strong")
            if keyword_header:  # coverage: condition always true in tests
                keyword_header.decompose()

            kwd_text = cleanup_str(keywords_tag.text)

            for kwd_str in kwd_text.split(", "):
                xarticle.kwds.append(create_subj(value=kwd_str))
            keywords_tag.decompose()

        # WARN: some abstracts have multiple paragraphs and/or lists, e.g.
        # https://www.seio.es/beio/grupo-de-tratamiento-y-analisis-de-big-data-tabida/
        abstract_tag = soup.select_one("div#main-content .abstract-section")
        if abstract_tag:
            abstract_header = abstract_tag.select_one("h3")
            if abstract_header:  # coverage: condition always true in tests
                abstract_header.decompose()
            abstract_str = abstract_tag.text
            # (A standalone lingua language-detection sketch follows the class.)
            xarticle.lang = self.detect_language(abstract_str)
            xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_str))

        # Unfortunately, SEIO author blocks are formatted differently depending on the issue.
        # We try to extract as much as we can, but some layouts are not straightforward to
        # parse; those cases are currently ignored, so no author will appear on Geodesic.
        # (A standalone sketch of this parsing, on an invented HTML fragment, follows the class.)
        author_tags = soup.select("div#main-content .author-info .author > p")
        if not author_tags:
            author_tags = soup.select("div#main-content p.author")

        for author_tag in author_tags:
            contributor = create_contributor(role="author")

            author_name = (
                author_tag.select_one("strong")
                or author_tag.select_one("span:first-child > code")
                or author_tag.select_one("code:first-child")
            )
            if not author_name:  # coverage: branch never taken in tests
                print(f"Couldn't find author name: {url}")
                continue

            contributor["string_name"] = author_name.text
            author_name.decompose()

            orcid_tag = author_tag.select_one("a[href^='https://orcid.org']")
            if orcid_tag:  # coverage: condition always true in tests
                orcid_url = orcid_tag.get("href")
                if not isinstance(orcid_url, str):  # coverage: branch never taken in tests
                    raise ValueError("Found orcid tag, but couldn't parse href")

                orcid_search = regex.search(self.orcid_re, orcid_url)
                if not orcid_search:  # coverage: branch never taken in tests
                    raise ValueError("Found orcid url, but couldn't parse it.")
                contributor["orcid"] = orcid_search.groupdict()["orcid"]
                orcid_tag.decompose()

            affiliation = next(
                (e.text for e in author_tag if str(e.text) not in ("<br/>", "\n", ".")), None
            )

            if isinstance(affiliation, str):  # coverage: condition always true in tests
                affiliation = cleanup_str(affiliation)
                if len(affiliation) > 0:  # coverage: branch never taken in tests
                    contributor["addresses"].append(cleanup_str(affiliation))

            xarticle.contributors.append(contributor)

        pdf_button = soup.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")
        if not pdf_button:  # coverage: branch never taken in tests
            raise ValueError("Couldn't find PDF link")
        pdf_url = pdf_button.get("href")
        if not isinstance(pdf_url, str):  # coverage: branch never taken in tests
            raise ValueError("Couldn't parse pdf url")
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
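

# The sketches below are not part of the crawler. This first one is a minimal, hypothetical
# example of what issue_re and volume_re are expected to match; the sample heading strings
# are invented to mirror the shape the patterns assume, not taken from seio.es.
import regex

issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
volume_re = r"Volumen? (?P<volume>\d+)"

issue_match = regex.search(issue_re, "Número 38, 1 / Marzo 2022")
volume_match = regex.search(volume_re, "Volumen 38")

# Expected: {'number': '38, 1', 'year': '2022'} and {'volume': '38'};
# the crawler later normalises '38, 1' to '38-1' via .replace(", ", "-").
print(issue_match.groupdict(), volume_match.groupdict())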
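

# A small, hedged sketch of the lingua detector that SeioCrawler builds, showing how an
# abstract's language could be identified. self.detect_language comes from
# BaseCollectionCrawler and is assumed to wrap a call of roughly this shape; the sample
# abstract text is invented.
import lingua
from lingua import LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_languages(
    lingua.Language.ENGLISH, lingua.Language.SPANISH, lingua.Language.FRENCH
).build()

sample_abstract = "Este artículo presenta un análisis estadístico de datos de encuestas."
detected = detector.detect_language_of(sample_abstract)
print(detected)                              # Language.SPANISH (expected)
print(detected.iso_code_639_1.name.lower())  # "es"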
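

# A simplified, self-contained sketch of the author-block parsing in parse_article_content,
# run on an invented HTML fragment. The ORCID pattern used here is a stand-in assumption:
# the real crawler relies on self.orcid_re inherited from BaseCollectionCrawler, whose
# exact pattern is not visible in this file.
from bs4 import BeautifulSoup
import regex

fragment = """
<div class="author-info"><div class="author"><p>
  <strong>Jane Doe</strong>
  <a href="https://orcid.org/0000-0002-1825-0097">ORCID</a>
  Universidad Ficticia, Madrid
</p></div></div>
"""
p = BeautifulSoup(fragment, "html.parser").select_one(".author-info .author > p")

name_tag = p.select_one("strong")
author_name = name_tag.text                  # "Jane Doe"
name_tag.decompose()

orcid_tag = p.select_one("a[href^='https://orcid.org']")
orcid_match = regex.search(r"(?P<orcid>\d{4}-\d{4}-\d{4}-\d{3}[\dX])", orcid_tag["href"])
orcid = orcid_match["orcid"]                 # "0000-0002-1825-0097"
orcid_tag.decompose()

# Whatever text remains in the <p> is treated as the affiliation.
affiliation = p.get_text().strip()           # "Universidad Ficticia, Madrid"
print(author_name, orcid, affiliation)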