Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py: 80%
110 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
1import re
2import typing
3from datetime import datetime
5from bs4 import BeautifulSoup, Tag
6from ptf.model_data import (
7 IssueData,
8 create_articledata,
9 create_contributor,
10 create_extlink,
11 create_issuedata,
12)
14from crawler.base_crawler import add_pdf_link_to_xarticle
15from crawler.utils import cleanup_str
17if typing.TYPE_CHECKING: 17 ↛ 18line 17 didn't jump to line 18 because the condition on line 17 was never true
18 from ..nsjom_crawler import NsjomCrawler
# Collection identifier: used as PID prefix and as ext-link metadata below.
source_domain = "NSJOM"
def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    periode_start: int = 0,
    periode_end: int | None = None,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from year-specific webpages : https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    From 2010 to 2014 (included)

    :param content: unused here; kept for interface compatibility.
    :param periode_start: requested lower bound; clamped to 2010.
    :param periode_end: requested upper bound; clamped to 2014. Defaults to
        the current year, resolved at call time.
    :param source_domain: shadows the module-level constant of the same name.
    :returns: list of IssueData, one per issue found on the year pages.
    """
    # BUGFIX: the previous default `datetime.now().year` was evaluated once at
    # import time, so a long-running process crossing a year boundary would
    # keep the stale year. A None sentinel resolves it per call instead.
    if periode_end is None:
        periode_end = datetime.now().year
    xissues: list[IssueData] = []
    year_start = max(2010, periode_start)
    year_end = min(2014, periode_end)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.get_page_content(url)
        xissues = xissues + parse_year(self, year_content, year, url)

    return xissues
def is_heading(element: "Tag") -> bool:
    """Return True when *element* is (or contains) an NSJOM issue heading.

    A heading either contains a descendant matching ``.HeadingNSJOM`` or
    carries the ``HeadingNSJOM`` class itself.  bs4 usually exposes ``class``
    as a list of tokens, but a plain string is tolerated defensively.
    """
    if element.select_one(".HeadingNSJOM"):
        return True

    classname = element.get("class")
    if not classname:
        return False

    # Defensive branch: bs4 normally returns a token list for `class`.
    if isinstance(classname, str):
        return classname == "HeadingNSJOM"

    return "HeadingNSJOM" in classname
def parse_issue_content(self, content: str, xissue: IssueData):
    """Re-parse a single issue page by delegating to :func:`parse_year`.

    The issue's own year/url/pid select which issue section is parsed.
    NOTE(review): parse_year's return value is discarded here — presumably
    the crawler framework picks the result up elsewhere; confirm with callers.
    """
    issue_year = int(xissue.year)
    parse_year(self, content, issue_year, xissue.url, xissue.pid)
def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_crawl: str | None = None,
):
    """Parse one 2010-2014 year page into a list of issues.

    :param content: HTML of https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html
    :param year: publication year of the page.
    :param url: source URL, stored on each parsed issue.
    :param pid_to_crawl: forwarded to parse_issue_tags to filter one issue.
    :returns: list of IssueData.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Headings delimit issues; style1 paragraphs / blockquote anchors are
    # the article metadata and title/link rows.
    page_elements = soup.select("p.HeadingNSJOM, p.style1, p.style1+blockquote a")
    issues: list[list[Tag]] = []
    # Sort tags into issues
    for current_element in page_elements:
        if is_heading(current_element):
            issues.append([current_element])
            continue
        if current_element.text == "\xa0":
            # Non-breaking-space placeholder rows carry no data.
            continue
        # BUGFIX: a stray element appearing before the first heading used to
        # hit issues[-1] on an empty list and raise IndexError; skip it.
        if not issues:
            continue
        issues[-1].append(current_element)

    xissues: list[IssueData] = []
    for issue_elements in issues:
        xissue = parse_issue_tags(self, issue_elements, year, pid_to_crawl)
        if not xissue:
            continue
        xissue.url = url
        xissues.append(xissue)

    return xissues
def parse_issue_tags(
    self: "NsjomCrawler", tags: list[Tag], year: int, pid_to_crawl: str | None = None
):
    """Build an IssueData from the tag list of one issue section.

    ``tags[0]`` must be the heading ("NSJOM Vol. X, No. Y"); the remaining
    tags are consumed destructively from the front in (metadata, title/link)
    pairs.

    :param tags: issue tags; this list is emptied as a side effect.
    :param year: publication year stored on the issue and used in its PID.
    :param pid_to_crawl: if set and different from this issue's PID, the
        issue is skipped and None is returned.
    :raises ValueError: when the volume number cannot be parsed.
    :returns: the populated IssueData, or None when filtered out.
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)
    # First tag is the issue heading, e.g. "NSJOM Vol. 40, No. 1".
    issue_title_tag = tags.pop(0)
    match = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No. (?P<Issue>\d+))?", issue_title_tag.text
    )
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    # NOTE(review): when the heading has no "No." part, issue_number is None
    # and the PID embeds the literal string "None" — confirm this is intended.
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_crawl and xissue.pid != pid_to_crawl:
        return

    # Remaining tags come in (metadata, title/link) pairs.
    while len(tags) > 0:
        article_meta = tags.pop(0)
        article_meta_text = cleanup_str(article_meta.text)
        if article_meta_text == "":
            continue

        if len(tags) == 0:
            # Metadata without a matching title tag: nothing left to pair.
            continue
        article_title = tags.pop(0)
        article = parse_article(
            self,
            article_meta_text,
            article_title,
            xissue.pid,
            len(xissue.articles),
            f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        )
        if article is None:
            # Article had no PDF link; skip without advancing the index.
            continue
        xissue.articles.append(article)
    return xissue
def parse_article(
    self: "NsjomCrawler",
    meta_text: str,
    title_tag: "Tag",
    issue_pid: str,
    index: int,
    source: str | None = None,
):
    """Create an ArticleData from one (metadata, title/link) tag pair.

    ``meta_text`` is expected to look like
    ``"<vol> / <num> / <year> / <pages>: <authors>"``.

    :param title_tag: anchor tag whose ``href`` is the PDF and text the title.
    :param issue_pid: parent issue PID; the article PID is derived from it.
    :param index: position of the article within the issue.
    :param source: optional source URL recorded as an ext-link.
    :raises ValueError: when the metadata line cannot be parsed, or the
        href attribute unexpectedly comes back as a list.
    :returns: the populated ArticleData, or None when no PDF href exists.
    """
    xarticle = create_articledata()

    if source:
        xarticle.ext_links.append(
            create_extlink(rel="source", location=source, metadata=self.source_domain)
        )

    meta_match = re.search(r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta_text)
    if meta_match is None:
        raise ValueError("Cannot parse authors or page number")
    xarticle.page_range = meta_match.group("Pages")

    # Lookaround-heavy pattern: appears to split the author string on
    # ", " / " and " separators while keeping them out of the captures.
    author_names = re.findall(r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", meta_match.group("Authors"))
    for name in author_names:
        xarticle.contributors.append(create_contributor(role="author", string_name=name))

    href = title_tag.get("href")
    if href is None:
        print("[NSJOM] article does not have a pdf")
        return None
    if isinstance(href, list):
        raise ValueError("Article link is a list")
    add_pdf_link_to_xarticle(xarticle, self.source_website + href)

    xarticle.title_tex = cleanup_str(title_tag.text)
    xarticle.pid = f"{issue_pid}a_{index}"

    return xarticle