Skip to content

Commit

Permalink
Merge pull request #15 from ItsIgnacioPortal/main
Browse files Browse the repository at this point in the history
Several improvements
  • Loading branch information
utkusen authored Sep 23, 2022
2 parents 020010d + 161bbb2 commit b988e80
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 52 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
archives/
test.txt
output.txt
keywords.txt
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ urlhunter requires 3 parameters to run: `-keywords`, `-date` and `-o`.

For example: `urlhunter -keywords keywords.txt -date 2020-11-20 -o out.txt`

### -keywords
### --keywords

You need to specify the text file that contains the keywords to search for in URLs. Keywords must be written one per line. You have three different ways to specify keywords:

Expand All @@ -55,7 +55,7 @@ You need to specify the txt file that contains keywords to search on URLs. Keywo

`regex 1\d{10}` will match `https://example.com/index.php?id=12938454312` but **_won't_** match `https://example.com/index.php?id=abc223`

### -date
### --date

urlhunter downloads the archive files of the given date(s). You have three different ways to specify the date:

Expand All @@ -69,7 +69,7 @@ For example: `-date 2020-11-20`

For example: `-date 2020-11-10:2020-11-20`

### -o
### --output

You can specify the output file with the `-o` parameter. For example: `-o out.txt`

Expand Down
149 changes: 100 additions & 49 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,50 @@ type Files struct {
} `xml:"file"`
}

// usage is the help text printed by flag.Usage. It lists every flag in both
// its short and long form. The --date section mirrors what main accepts:
// the literal "latest", a single YYYY-MM-DD date (searched on its own, not as
// the start of a range), or a START:END range.
const usage = `Usage: ./urlhunter --keywords /path/to/keywordsFile --date DATE-RANGE-HERE --output /path/to/outputFile [--archives /path/to/archives]
Example: ./urlhunter --keywords keywords.txt --date 2020-11-20 --output out.txt
-k, --keywords /path/to/keywordsFile
Path to a file that contains strings to search.
-d, --date DATE-RANGE-HERE
You may specify "latest", a single date, or a range. A single date searches only the archive of that date.
Latest: "latest". Single date: "2020-11-20". Range: "2020-11-10:2020-11-20".
-o, --output /path/to/outputFile
Path to a file where the output will be written.
-a, --archives /path/to/archives
Path to the directory where you're storing your archive files. If this is your first time running this tool, the archives will be downloaded on a new ./archives folder
`

// err is a package-level error value. main passes it to crash when required
// arguments are missing.
// NOTE(review): it appears never to be assigned in the visible code (other
// call sites declare their own err with :=), so it is nil there — confirm.
var err error
// archivesPath is the directory used for storing and looking up archive
// files. It is set by the -a / --archives flag and defaults to "archives".
var archivesPath string

func main() {
keywordFile := flag.String("keywords", "", "A txt file that contains strings to search.")
dateParam := flag.String("date", "", "A single date or a range to search. Single: YYYY-MM-DD Range:YYYY-MM-DD:YYYY-MM-DD")
outFile := flag.String("o", "", "Output file")

var keywordFile string
var dateParam string
var outFile string

flag.StringVar(&keywordFile, "k", "", "A txt file that contains strings to search.")
flag.StringVar(&keywordFile, "keywords", "", "A txt file that contains strings to search.")
flag.StringVar(&dateParam, "d", "", "A single date or a range to search. Single: YYYY-MM-DD Range:YYYY-MM-DD:YYYY-MM-DD")
flag.StringVar(&dateParam, "date", "", "A single date or a range to search. Single: YYYY-MM-DD Range:YYYY-MM-DD:YYYY-MM-DD")
flag.StringVar(&outFile, "o", "", "Output file")
flag.StringVar(&outFile, "output", "", "Output file")
flag.StringVar(&archivesPath, "a", "archives", "Archives file path")
flag.StringVar(&archivesPath, "archives", "archives", "Archives file path")

//https://www.antoniojgutierrez.com/posts/2021-05-14-short-and-long-options-in-go-flags-pkg/
flag.Usage = func() { fmt.Print(usage) }
flag.Parse()
if *keywordFile == "" || *dateParam == "" || *outFile == "" {
color.Red("Please specify all arguments!")
flag.PrintDefaults()


if keywordFile == "" || dateParam == "" || outFile == "" {
crash("You must specify all arguments.", err)
return
}

fmt.Println(`
o Utku Sen's
\_/\o
Expand All @@ -63,35 +97,32 @@ func main() {
{K || twitter.com/utkusen
`)
_ = os.Mkdir("archives", os.ModePerm)
if strings.Contains(*dateParam, ":") {
startDate, err := time.Parse("2006-01-02", strings.Split(*dateParam, ":")[0])
_ = os.Mkdir(archivesPath, os.ModePerm)
if strings.Contains(dateParam, ":") {
startDate, err := time.Parse("2006-01-02", strings.Split(dateParam, ":")[0])
if err != nil {
color.Red("Wrong date format!")
return
crash("Wrong date format!", err)
}
endDate, err := time.Parse("2006-01-02", strings.Split(*dateParam, ":")[1])
endDate, err := time.Parse("2006-01-02", strings.Split(dateParam, ":")[1])
if err != nil {
color.Red("Wrong date format!")
return
crash("Wrong date format!", err)
}
for rd := rangeDate(startDate, endDate); ; {
date := rd()
if date.IsZero() {
break
}
getArchive(getArchiveList(), string(date.Format("2006-01-02")), *keywordFile, *outFile)
getArchive(getArchiveList(), string(date.Format("2006-01-02")), keywordFile, outFile)
}
} else {
if *dateParam != "latest" {
_, err := time.Parse("2006-01-02", *dateParam)
if dateParam != "latest" {
_, err := time.Parse("2006-01-02", dateParam)
if err != nil {
color.Red("Wrong date format!")
return
crash("Wrong date format!", err)
}
}

getArchive(getArchiveList(), *dateParam, *keywordFile, *outFile)
getArchive(getArchiveList(), dateParam, keywordFile, outFile)
}
color.Green("Search complete!")
}
Expand All @@ -110,7 +141,7 @@ func getArchiveList() []byte {
}

func getArchive(body []byte, date string, keywordFile string, outfile string) {
fmt.Println("Search starting for: " + date)
color.Cyan("Search starting for: " + date)
type Response struct {
Items []struct {
Identifier string `json:"identifier"`
Expand All @@ -137,47 +168,45 @@ func getArchive(body []byte, date string, keywordFile string, outfile string) {
}

if !flag {
color.Red("Couldn't find an archive with that date!")
info("Couldn't find an archive with that date.")
return
}
dumpFiles := archiveMetadata(fullname)
if ifArchiveExists(fullname) {
color.Cyan(fullname + " Archive already exists!")
info(fullname + " already exists locally. Skipping download..")
} else {
for _, item := range dumpFiles.File {
dumpFilepath, _ := filepath.Glob(filepath.Join("archives", fullname, item.DumpType, "*.txt"))
dumpFilepath, _ := filepath.Glob(filepath.Join(archivesPath, fullname, item.DumpType, "*.txt"))
if len(dumpFilepath) > 0 {
_ = os.Remove(dumpFilepath[0])
}

if !fileExists(filepath.Join("archives", fullname, item.Name)) {
color.Red(item.Name + " doesn't exist locally.")
if !fileExists(filepath.Join(archivesPath, fullname, item.Name)) {
info(item.Name + " doesn't exist locally. The file will be downloaded.")
url1 := "https://archive.org/download/" + fullname + "/" + item.Name
downloadFile(url1)
}

color.Magenta("Unzipping: " + item.Name)
_, err := Unzip(filepath.Join("archives", fullname, item.Name), filepath.Join("archives", fullname))
info("Unzipping: " + item.Name)
_, err := Unzip(filepath.Join(archivesPath, fullname, item.Name), filepath.Join(archivesPath, fullname))
if err != nil {
color.Red(item.Name + " looks damaged. It's removed now. Run the program again to re-download.")
os.Remove(filepath.Join("archives", fullname, item.Name))
os.Exit(1)
os.Remove(filepath.Join(archivesPath, fullname, item.Name))
crash(item.Name + " looks damaged. It's removed now. Run the program again to re-download.", err)
}
}

color.Cyan("Decompressing XZ Archives..")
info("Decompressing XZ Archives..")
for _, item := range dumpFiles.File {
tarfile, _ := filepath.Glob(filepath.Join("archives", fullname, item.DumpType, "*.txt.xz"))
tarfile, _ := filepath.Glob(filepath.Join(archivesPath, fullname, item.DumpType, "*.txt.xz"))
_, err := exec.Command("xz", "--decompress", tarfile[0]).Output()
if err != nil {
fmt.Println(err)
panic(err)
crash("Error decompressing the downloaded archives", err)
}
}

color.Cyan("Removing Zip Files..")
info("Removing Zip Files..")
for _, item := range dumpFiles.File {
_ = os.Remove(filepath.Join("archives", fullname, item.Name))
_ = os.Remove(filepath.Join(archivesPath, fullname, item.Name))
}
}
fileBytes, err := ioutil.ReadFile(keywordFile)
Expand All @@ -190,17 +219,26 @@ func getArchive(body []byte, date string, keywordFile string, outfile string) {
continue
}
for _, item := range dumpFiles.File {
dump_path, _ := filepath.Glob(filepath.Join("archives", fullname, item.DumpType, "*.txt"))
dump_path, _ := filepath.Glob(filepath.Join(archivesPath, fullname, item.DumpType, "*.txt"))
searchFile(dump_path[0], keywordSlice[i], outfile)
}
}

}

func searchFile(fileLocation string, keyword string, outfile string) {
path_parts := strings.Split(fileLocation, string(os.PathSeparator))
path := filepath.Join(path_parts[1], path_parts[2])
fmt.Println("Searching: " + keyword + " in: " + path)

var path string

if strings.HasPrefix(fileLocation, "archives"){
path_parts := strings.Split(fileLocation, string(os.PathSeparator))
path = filepath.Join(path_parts[1], path_parts[2])
} else {
path = fileLocation
}

info("Searching: \"" + keyword + "\" in " + path)

f, err := os.Open(fileLocation)
scanner := bufio.NewScanner(f)
if err != nil {
Expand All @@ -216,7 +254,7 @@ func searchFile(fileLocation string, keyword string, outfile string) {
regexValue := strings.Split(keyword, " ")[1]
r, err := regexp.Compile(regexValue)
if err != nil {
color.Red("Invalid Regex!")
warning("Invalid Regex!")
return
}
for scanner.Scan() {
Expand Down Expand Up @@ -265,7 +303,7 @@ func searchFile(fileLocation string, keyword string, outfile string) {
func ifArchiveExists(fullname string) bool {
dumpFiles := archiveMetadata(fullname)
for _, item := range dumpFiles.File {
archiveFilepaths, err := filepath.Glob(filepath.Join("archives", fullname, item.DumpType, "*.txt"))
archiveFilepaths, err := filepath.Glob(filepath.Join(archivesPath, fullname, item.DumpType, "*.txt"))
if len(archiveFilepaths) == 0 || err != nil {
return false
}
Expand All @@ -275,12 +313,12 @@ func ifArchiveExists(fullname string) bool {

func archiveMetadata(fullname string) Files {
metadataFilename := "urlteam_" + strings.Split(fullname, "_")[1] + "_files.xml"
if !fileExists(filepath.Join("archives", fullname, metadataFilename)) {
color.Red(metadataFilename + " doesn't exists locally.")
if !fileExists(filepath.Join(archivesPath, fullname, metadataFilename)) {
info(metadataFilename + " doesn't exist locally. The file will be downloaded.")
metadataUrl := "https://archive.org/download/" + fullname + "/" + metadataFilename
downloadFile(metadataUrl)
}
byteValue, _ := ioutil.ReadFile(filepath.Join("archives", fullname, metadataFilename))
byteValue, _ := ioutil.ReadFile(filepath.Join(archivesPath, fullname, metadataFilename))
files := Files{}
xml.Unmarshal(byteValue, &files)
// Not all files are dumps, this struct will only contain zip dumps
Expand All @@ -305,8 +343,8 @@ func fileExists(filename string) bool {
func downloadFile(url string) {
dirname := strings.Split(url, "/")[4]
filename := strings.Split(url, "/")[5]
fmt.Println("Downloading: " + url)
_ = os.MkdirAll(filepath.Join("archives", dirname), os.ModePerm)
info("Downloading: " + url)
_ = os.MkdirAll(filepath.Join(archivesPath, dirname), os.ModePerm)
req, err := http.NewRequest("GET", url, nil)
if err != nil {
panic(err)
Expand All @@ -316,7 +354,7 @@ func downloadFile(url string) {
panic(err)
}
defer resp.Body.Close()
f, err := os.OpenFile(filepath.Join("archives", dirname, filename), os.O_CREATE|os.O_WRONLY, 0644)
f, err := os.OpenFile(filepath.Join(archivesPath, dirname, filename), os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
panic(err)
}
Expand Down Expand Up @@ -397,3 +435,16 @@ func rangeDate(start, end time.Time) func() time.Time {
return date
}
}

// info writes an informational message to standard output, prefixed with
// "[+]: " and terminated by a newline.
func info(message string) {
	const prefix = "[+]: "
	fmt.Println(prefix + message)
}

// crash reports a fatal error in red and terminates the program. It never
// returns.
//
// message is the human-readable description shown to the user. err is the
// underlying error, if any: when it is non-nil the program panics with it (as
// before), so the cause and stack trace are visible. When it is nil — e.g. a
// plain usage error with no underlying cause — the program exits with status 1
// instead of calling panic(nil), which would only produce a confusing
// nil-panic stack trace.
func crash(message string, err error) {
	// Pass message as an argument, not as the format string, so a '%' inside
	// it (for instance in a file name) is printed literally.
	color.Red("[ERROR]: %s\n", message)
	if err == nil {
		os.Exit(1)
	}
	panic(err)
}

// warning prints a non-fatal warning in yellow, prefixed with "[WARNING]: ".
//
// message is passed as an argument rather than as the format string, so a '%'
// inside it is printed literally instead of being treated as a formatting verb.
func warning(message string) {
	color.Yellow("[WARNING]: %s\n", message)
}

0 comments on commit b988e80

Please sign in to comment.