Coverage for src/crawler/by_source/seio_crawler.py: 77%

113 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

import lingua
import regex
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


# STAT: currently only supports BEIO.
# Issue and volume number parsing needs to be adapted for other collections.
class SeioCrawler(BaseCollectionCrawler):
    source_name = "Sociedad de Estadistica e Investigación Operativa"
    source_domain = "SEIO"
    source_website = "https://www.seio.es/"

    issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
    volume_re = r"Volumen? (?P<volume>\d+)"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.SPANISH, lingua.Language.FRENCH
    ).build()
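    # Illustrative sketch (not part of the crawled site data): the heading strings below
    # are assumed examples of SEIO issue toggles, shown only to document what the two
    # regexes are meant to capture.
    #   >>> import regex
    #   >>> regex.search(SeioCrawler.issue_re, "Número 1, 2 / Marzo 2023").groupdict()
    #   {'number': '1, 2', 'year': '2023'}
    #   >>> regex.search(SeioCrawler.volume_re, "Volumen 39").groupdict()
    #   {'volume': '39'}
    # The crawler then normalises a combined number such as "1, 2" to "1-2".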


    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        issues_tags = soup.select(".et_pb_toggle a")
        for i in issues_tags:
            issue_search = regex.search(self.issue_re, i.text)
            if not issue_search:
                print(f"Couldn't parse issue data from string: {i.text}")
                continue
            issue_data = issue_search.groupdict()

            parent_tag = i.find_parent(class_="et_pb_toggle")
            volume_tag = parent_tag.select_one(".et_pb_toggle_title")

            if not volume_tag:  # partial branch: condition never true
                raise ValueError("Couldn't find volume")

            volume_dict = regex_to_dict(
                self.volume_re,
                volume_tag.text,
                error_msg="Couldn't parse volume",
            )
            if volume_dict["volume"] is None:  # partial branch: condition never true
                raise ValueError("Couldn't parse volume text")

            url = i.get("href")
            if not isinstance(url, str):  # partial branch: condition never true
                raise ValueError("Couldn't parse issue href")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, url),
                    issue_data["year"],
                    volume_dict["volume"],
                    issue_data["number"].replace(", ", "-"),
                )
            )
        return xissues
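        # A minimal sketch of the collection-page markup this method assumes (the exact
        # SEIO/Divi structure may differ; this fragment is illustrative only):
        #
        #   <div class="et_pb_toggle">
        #     <h5 class="et_pb_toggle_title">Volumen 39</h5>
        #     <div class="et_pb_toggle_content">
        #       <a href="/beio/numero-1-2023/">Número 1 / Marzo 2023</a>
        #       <a href="/beio/vol39-num2.pdf">Número 2 / Julio 2023</a>
        #     </div>
        #   </div>
        #
        # Each anchor becomes one xissue; a link that points straight at a PDF is handled
        # by crawl_issue below.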


    def crawl_issue(self, xissue):
        if not xissue.url:
            raise ValueError("Error while parsing issue: issue url must be set")
        if xissue.url.endswith(".pdf"):
            add_pdf_link_to_xarticle(xissue, xissue.url)
            xissue.url = None
        return super().crawl_issue(xissue)
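        # Hedged note: when an issue's link points directly at a PDF (e.g. a path such as
        # ".../algun-numero.pdf", assumed here for illustration), the file is attached to
        # the issue itself and xissue.url is cleared, presumably so the base crawler does
        # not try to fetch it as an HTML issue page.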


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("h2.entry-title a")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            url = article_tag.get("href")
            if not isinstance(url, str):  # partial branch: condition never true
                raise ValueError("Couldn't parse article href")
            xarticle.url = url
            xissue.articles.append(xarticle)
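        # A minimal sketch of the issue-page markup this method assumes (illustrative,
        # not copied from the site):
        #
        #   <h2 class="entry-title"><a href="/beio/some-article/">Article title</a></h2>
        #
        # One xarticle (pid "a0", "a1", ...) is created per matching anchor.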


    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.select_one("div#main-content h1.br--text")
        if title_tag is None:  # partial branch: condition never true
            raise ValueError("Couldn't parse title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        keywords_tag = soup.select_one("div#main-content .keywords")
        if keywords_tag:
            keyword_header = keywords_tag.select_one("strong")
            if keyword_header:  # partial branch: condition always true
                keyword_header.decompose()

            kwd_text = cleanup_str(keywords_tag.text)

            for kwd_str in kwd_text.split(", "):
                xarticle.kwds.append(create_subj(value=kwd_str))
            keywords_tag.decompose()

        # WARN: some abstracts have multiple paragraphs and/or lists
        # https://www.seio.es/beio/grupo-de-tratamiento-y-analisis-de-big-data-tabida/
        abstract_tag = soup.select_one("div#main-content .abstract-section")
        if abstract_tag:
            abstract_header = abstract_tag.select_one("h3")
            if abstract_header:  # partial branch: condition always true
                abstract_header.decompose()
            abstract_str = abstract_tag.text
            xarticle.lang = self.detect_language(abstract_str)
            xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_str))
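            # Hedged aside: detect_language is not defined in this module; the class-level
            # lingua detector above can also be exercised directly, e.g.
            #   language_detector.detect_language_of("El análisis estadístico de los datos")
            # returns lingua.Language.SPANISH (sample sentence assumed, for illustration only).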


        # Unfortunately, SEIO authors are formatted differently depending on the issue.
        # We try to pick up as many as we can, but some layouts are not straightforward
        # to parse; those cases are currently ignored, so no author appears on Geodesic
        # for them (see the markup sketch after the loop below).
        author_tags = soup.select("div#main-content .author-info .author > p")
        if not author_tags or len(author_tags) == 0:
            author_tags = soup.select("div#main-content p.author")

        for author_tag in author_tags:
            contributor = create_contributor(role="author")

            author_name = (
                author_tag.select_one("strong")
                or author_tag.select_one("span:first-child > code")
                or author_tag.select_one("code:first-child")
            )
            if not author_name:  # partial branch: condition never true
                print(f"Couldn't find author name: {url}")
                continue

            contributor["string_name"] = author_name.text
            author_name.decompose()

            orcid_tag = author_tag.select_one("a[href^='https://orcid.org']")
            if orcid_tag:  # partial branch: condition always true
                orcid_url = orcid_tag.get("href")
                if not isinstance(orcid_url, str):  # partial branch: condition never true
                    raise ValueError("Found orcid tag, but couldn't parse href")

                orcid_search = regex.search(self.orcid_re, orcid_url)
                if not orcid_search:  # partial branch: condition never true
                    raise ValueError("Found orcid url, but couldn't parse it.")
                contributor["orcid"] = orcid_search.groupdict()["orcid"]
                orcid_tag.decompose()

            affiliation = next(
                (e.text for e in author_tag if str(e.text) not in ("<br/>", "\n", ".")), None
            )

            if isinstance(affiliation, str):  # partial branch: condition always true
                affiliation = cleanup_str(affiliation)
                if len(affiliation) > 0:  # partial branch: condition never true
                    contributor["addresses"].append(cleanup_str(affiliation))

            xarticle.contributors.append(contributor)

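        # Illustrative sketch of the two author layouts handled above (assumed shapes,
        # not copied from the site):
        #
        #   <div class="author-info"><div class="author">
        #     <p><strong>Jane Doe</strong><br/>
        #        <a href="https://orcid.org/0000-0000-0000-0000">ORCID</a><br/>
        #        Universidad de Ejemplo</p>
        #   </div></div>
        #
        #   <p class="author"><span><code>Jane Doe</code></span><br/>Universidad de Ejemplo</p>
        #
        # Layouts that match neither selector are skipped, so those articles end up
        # without contributors.
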

        pdf_button = soup.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")
        if not pdf_button:  # partial branch: condition never true
            raise ValueError("Couldn't find PDF link")
        pdf_url = pdf_button.get("href")
        if not isinstance(pdf_url, str):  # partial branch: condition never true
            raise ValueError("Couldn't parse pdf url")
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
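

# Quick, assumed illustration of the :-soup-contains() selector used above (the HTML
# fragment is made up; it only documents the expected shape of the download button):
#   >>> from bs4 import BeautifulSoup
#   >>> frag = BeautifulSoup(
#   ...     '<div id="main-content"><a class="et_pb_button" href="/doc.pdf">Ver PDF</a></div>',
#   ...     "html.parser",
#   ... )
#   >>> frag.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")["href"]
#   '/doc.pdf'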