Skip to content

Commit 748a28f

Browse files
committed
First Commit
1 parent 0a04abc commit 748a28f

File tree

3 files changed

+67
-0
lines changed

3 files changed

+67
-0
lines changed

links.py

Lines changed: 29 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/env python
2+
3+
from BeautifulSoup import BeautifulSoup
4+
#import urllib2
5+
import re # Module for RegEx
6+
import yaml
7+
import cfscrape
8+
9+
def getLinks(url):
    """Fetch *url* through a Cloudflare-aware scraper and return a list.

    The first element is a fixed header string; the rest are the href
    values of every anchor on the page whose href starts with http:// or
    https:// (absolute links only).
    """
    # cfscrape solves Cloudflare's anti-bot challenge before handing back HTML.
    session = cfscrape.create_scraper()
    page_html = session.get(url).content
    parsed = BeautifulSoup(page_html)
    # Only anchors whose href is an absolute http/https URL.
    anchors = parsed.findAll('a', attrs={'href': re.compile("^(http|https)://")})
    return ['Full Web Page Internal & External links'] + [a.get('href') for a in anchors]
21+
22+
## Output printed in result.yml file
# Serialize the scraped link list as YAML; encoding='utf-8' plus
# allow_unicode=True keeps non-ASCII hrefs readable instead of escaped,
# and default_flow_style=False emits one entry per line (block style).
with open('result.yml', 'w') as yaml_file:
    yaml.safe_dump(getLinks('https://www.example.com'), yaml_file,
                   default_flow_style=False, encoding='utf-8',
                   allow_unicode=True)

# Parenthesized single-argument print behaves identically on Python 2 and 3;
# the bare `print "done"` statement is a SyntaxError on Python 3.
print("done")

og.py

Lines changed: 19 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env python
"""Fetch a page and print its Open Graph image URL (og:image), if present."""

#import urllib2
from BeautifulSoup import BeautifulSoup
import cfscrape

# Get the text at the set URL.
# cfscrape solves Cloudflare's anti-bot challenge before returning the HTML.
scraper = cfscrape.create_scraper()

url = "https://example.com"
cfurl = scraper.get(url).content
#bs = BeautifulSoup(urllib2.urlopen(url))
bs = BeautifulSoup(cfurl)

# Open Graph image is declared as <meta property="og:image" content="...">.
metatag = bs.find("meta", {"property": "og:image"})
if metatag is not None:
    # Parenthesized single-argument print runs on both Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(metatag["content"])
else:
    print("This page has no Open Graph meta image tag")

weblink.py

Lines changed: 19 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env python
"""Print every absolute http/https link found on a hard-coded page."""

from BeautifulSoup import BeautifulSoup
#import urllib2
import cfscrape
import re

#html_page = urllib2.urlopen("https://example.com")

# Get the text at the set URL.
# cfscrape solves Cloudflare's anti-bot challenge before returning the HTML.
scraper = cfscrape.create_scraper()

url = "https://example.com"
cfurl = scraper.get(url).content
soup = BeautifulSoup(cfurl)

# Only anchors whose href is an absolute http/https URL.
for link in soup.findAll('a', attrs={'href': re.compile("^(http|https)://")}):
    ## Print Output
    # Parenthesized single-argument print runs on both Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(link.get('href'))

0 commit comments

Comments
 (0)