Skip to content

Commit

Permalink
Merge pull request #6 from ipfs-shipyard/feat/use-dht-crawler
Browse files Browse the repository at this point in the history
DHT Crawler
  • Loading branch information
willscott authored Jan 28, 2021
2 parents 6c30019 + 4bc3ab5 commit 9292515
Show file tree
Hide file tree
Showing 16 changed files with 1,526 additions and 503 deletions.
14 changes: 0 additions & 14 deletions .circleci/config.yml

This file was deleted.

39 changes: 39 additions & 0 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: Go

on:
push:
branches: [ master ]
pull_request:
branches: [ master ]

jobs:

build:
name: Build
runs-on: ubuntu-latest
steps:

- name: Set up Go 1.x
uses: actions/setup-go@v2
with:
go-version: ^1.15
id: go

- name: Check out code into the Go module directory
uses: actions/checkout@v2

- name: Get dependencies
run: go get -v -t -d ./...

- name: Build
run: CGO_ENABLED=0 go build -v ./...

- name: Run golangci-lint
uses: actions-contrib/golangci-lint@v1
env:
GOROOT: ""
with:
args: "run"

- name: Test
run: go test -v .
21 changes: 21 additions & 0 deletions .github/workflows/package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: Publish Docker image
on:
release:
types: [published]
jobs:
push_to_registry:
name: Push Docker image to GitHub Packages
runs-on: ubuntu-latest
steps:
- name: Check out the repo
uses: actions/checkout@v2
- name: Push to GitHub Packages
uses: docker/build-push-action@v1
with:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
registry: docker.pkg.github.com
repository: ipfs-shipyard/ipfs-counter/ipfs-counter
tags: latest
tag_with_ref: true
add_git_labels: true
17 changes: 17 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, built with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Dependency directories (remove the comment below to include it)
# vendor/

*.json
40 changes: 40 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
FROM golang:alpine AS builder

# Git is required for fetching the dependencies.
# Ca-certificates is required to call HTTPS endpoints.
RUN apk update && apk add --no-cache git ca-certificates && update-ca-certificates

# Create appuser
ENV USER=appuser
ENV UID=10001

# See https://stackoverflow.com/a/55757473/12429735
RUN adduser \
--disabled-password \
--gecos "" \
--home "/nonexistent" \
--shell "/sbin/nologin" \
--no-create-home \
--uid "${UID}" \
"${USER}"

WORKDIR $GOPATH/src/github.com/ipfs-shipyard/ipfs-counter/

COPY . .
RUN go mod download
RUN go mod verify

RUN GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -ldflags="-w -s" -o /go/bin/ipfs-counter

# STEP 2 build a small image
FROM scratch

COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /etc/passwd /etc/passwd
COPY --from=builder /etc/group /etc/group

COPY --from=builder /go/bin/ipfs-counter /go/bin/ipfs-counter

USER appuser:appuser

ENTRYPOINT ["/go/bin/ipfs-counter"]
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Adin Schmahmann

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,11 @@ Crawls the IPFS DHT to give some metrics. Note: this project is a WIP and result

`go build`

### Usage

Running the application output prometheus metrics on the path `/metrics` with port 1234. Set the environment variable `IPFS_METRICS_PASSWORD` to control access to the prometheus metrics.

Network data is output into the `netdata` folder via levelDB

## Lead Maintainer
## Lead Maintainers

[Adin Schmahmann](https://github.com/aschmahmann)
[Will Scott](https://github.com/willscott)

## Contributing

Expand Down
216 changes: 216 additions & 0 deletions crawl.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
package main

import (
"context"
"time"

logging "github.com/ipfs/go-log"
"github.com/urfave/cli/v2"

"github.com/libp2p/go-libp2p"
"github.com/libp2p/go-libp2p-core/crypto"
"github.com/libp2p/go-libp2p-core/host"
"github.com/libp2p/go-libp2p-core/peer"
noise "github.com/libp2p/go-libp2p-noise"
quic "github.com/libp2p/go-libp2p-quic-transport"
secio "github.com/libp2p/go-libp2p-secio"
tls "github.com/libp2p/go-libp2p-tls"

"github.com/libp2p/go-libp2p-kad-dht/crawler"
"github.com/multiformats/go-multiaddr"
)

// crawlFlags declares the command-line flags read by the crawl action.
//
// The flags fall into three groups: where results go ("output", "dataset",
// "table", "create-tables"), where the initial peer set comes from
// ("seed-file", "seed-table", "seed-table-duration"), and how the crawl
// itself is tuned ("parallelism", "timeout", "crawltime", "debug").
var crawlFlags = []cli.Flag{
	&cli.StringFlag{
		Name:      "output",
		TakesFile: true,
		Usage:     "Output file location",
		Value:     "crawl-output",
	},
	&cli.StringFlag{
		Name:  "dataset",
		Usage: "Google bigquery dataset ID for insertion",
	},
	&cli.StringFlag{
		Name:  "table",
		Usage: "Google bigquery table prefix for insertion",
	},
	&cli.BoolFlag{
		Name:  "create-tables",
		Usage: "To create bigquery tables if they do not exist",
	},
	&cli.StringFlag{
		Name:      "seed-file",
		TakesFile: true,
		Usage:     "Use peers from a file to seed crawling",
	},
	&cli.StringFlag{
		Name:  "seed-table",
		Usage: "Use peers / multiaddrs from previous trial table to seed crawling",
	},
	&cli.DurationFlag{
		Name:  "seed-table-duration",
		Usage: "when seeding from table, select date range for querying hosts",
		// Defaults to one week.
		Value: 7 * 24 * time.Hour,
	},
	&cli.IntFlag{
		Name:  "parallelism",
		Usage: "How many connections to open at once",
		Value: 1000,
	},
	&cli.DurationFlag{
		Name:  "timeout",
		Usage: "How long to wait on dial attempts",
		Value: 5 * time.Second,
	},
	&cli.DurationFlag{
		Name:  "crawltime",
		Usage: "How long to crawl for",
		Value: 20 * time.Hour,
	},
	&cli.BoolFlag{
		Name:  "debug",
		Usage: "Print debugging messages",
	},
}

// must unwraps a (Multiaddr, error) pair, returning the address when the
// error is nil and panicking with the error otherwise. It is used to build
// the package-level bootstrap address list from string literals.
func must(m multiaddr.Multiaddr, e error) multiaddr.Multiaddr {
	if e == nil {
		return m
	}
	panic(e)
}

// bootstrapAddrs is the built-in list of peer multiaddrs that crawl always
// adds to the seed set, in addition to any peers loaded from a seed file or
// seed table. Each literal is parsed at package initialisation through must,
// so a malformed entry panics at start-up.
var bootstrapAddrs = []multiaddr.Multiaddr{
// Entries written with the /p2p/<peer id> suffix.
must(multiaddr.NewMultiaddr("/ip4/139.178.89.189/tcp/4001/p2p/QmZa1sAxajnQjVM8WjWXoMbmPd7NsWhfKsPkErzpm9wGkp")),
must(multiaddr.NewMultiaddr("/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ")),
must(multiaddr.NewMultiaddr("/ip4/207.148.19.196/tcp/20074/p2p/12D3KooWGXBbSZ3ko3UvoekdnnSrdmuFic3XHuNKvGcZyrH1mVxr")),
must(multiaddr.NewMultiaddr("/ip4/18.185.241.99/tcp/20001/p2p/12D3KooWA4NVc1GytssyhxGqaT22kJ9XwdhCpS2VwNPPMw59Ctf4")),
must(multiaddr.NewMultiaddr("/ip4/64.225.116.25/tcp/30017/p2p/12D3KooWHHVPRYiXuWsVmATm8nduX7dXXpw3kC5Co1QSUYVLNXZN")),

// Entries written with the /ipfs/<peer id> suffix. The first one names the
// same IP, port and peer ID as the second /p2p/ entry above.
must(multiaddr.NewMultiaddr("/ip4/104.131.131.82/tcp/4001/ipfs/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ")), // mars.i.ipfs.io
must(multiaddr.NewMultiaddr("/ip4/104.236.179.241/tcp/4001/ipfs/QmSoLPppuBtQSGwKDZT2M73ULpjvfd3aZ6ha4oFGL1KrGM")), // pluto.i.ipfs.io
must(multiaddr.NewMultiaddr("/ip4/128.199.219.111/tcp/4001/ipfs/QmSoLSafTMBsPKadTEgaXctDQVcqN88CNLHXMkTNwMKPnu")), // saturn.i.ipfs.io
must(multiaddr.NewMultiaddr("/ip4/104.236.76.40/tcp/4001/ipfs/QmSoLV4Bbm51jM9C4gDYZQ9Cy3U6aXMJDAbzgu2fzaDs64")), // venus.i.ipfs.io
must(multiaddr.NewMultiaddr("/ip4/178.62.158.247/tcp/4001/ipfs/QmSoLer265NRgSp2LA3dPaeykiS1J6DifTC88f5uVQKNAd")), // earth.i.ipfs.io
must(multiaddr.NewMultiaddr("/ip4/104.236.151.122/tcp/4001/ipfs/QmSoLju6m7xTh3DuokvT3886QRYqxAzb1kShaanJgW36yx")),
must(multiaddr.NewMultiaddr("/ip4/188.40.114.11/tcp/4001/ipfs/QmZY7MtK8ZbG1suwrxc7xEYZ2hQLf1dAWPRHhjxC8rjq8E")),
must(multiaddr.NewMultiaddr("/ip4/5.9.59.34/tcp/4001/ipfs/QmRv1GNseNP1krEwHDjaQMeQVJy41879QcDwpJVhY8SWve")),
}

// makeHost builds the libp2p host used for crawling and registers it with
// the recorder.
//
// The host uses r as its connection gater, listens on TCP port 4001 on all
// IPv4 interfaces, enables QUIC in addition to the default transports, and
// offers TLS, Noise and SECIO as security transports. On success the host
// is handed to r.setHost before being returned; any error from host
// construction or from setHost is returned unchanged.
func makeHost(c *cli.Context, r *Recorder) (host.Host, error) {
	// Process-wide setting: lower the minimum accepted RSA key size to 512
	// bits (presumably so peers with short keys can still be contacted).
	crypto.MinRsaKeyBits = 512

	opts := []libp2p.Option{
		libp2p.ConnectionGater(r),
		libp2p.ListenAddrStrings("/ip4/0.0.0.0/tcp/4001"),
		libp2p.Transport(quic.NewTransport),
		libp2p.DefaultTransports,
		// libp2p.Transport(tcp.NewTCPTransport),
		// libp2p.Transport(ws.New),
		libp2p.Security(tls.ID, tls.New),
		libp2p.Security(noise.ID, noise.New),
		libp2p.Security(secio.ID, secio.New),
	}

	node, err := libp2p.New(c.Context, opts...)
	if err != nil {
		return nil, err
	}

	if err = r.setHost(node); err != nil {
		return nil, err
	}
	return node, nil
}

// crawl is the CLI action that performs one DHT crawl and writes the result.
//
// Steps:
//  1. Set the "dht-crawler" log level ("debug" when --debug is set, else "info").
//  2. Create the Recorder and the libp2p host.
//  3. Build the seed set: peers from --seed-file if set, otherwise from
//     --seed-table (queried in --dataset over --seed-table-duration) if set,
//     and always the built-in bootstrapAddrs.
//  4. Add every seed's addresses to the host peerstore with a one hour TTL.
//  5. Run the crawler with --parallelism and --timeout, bounded by
//     --crawltime, reporting per-peer results to the Recorder.
//  6. Write the collected data to --output.
//
// It returns the first fatal error met during setup, or the error from
// Output. Individual addresses that fail to parse are logged and skipped.
func crawl(c *cli.Context) error {
	ll := "info"
	if c.Bool("debug") {
		ll = "debug"
	}
	logger := logging.Logger("dht-crawler")
	if err := logging.SetLogLevel("dht-crawler", ll); err != nil {
		return err
	}

	ctx := c.Context

	r, err := NewRecorder(c)
	if err != nil {
		return err
	}

	// Named h rather than host so the imported host package is not shadowed.
	h, err := makeHost(c, r)
	if err != nil {
		return err
	}

	pending := newMAList()

	// For both seed sources, a false ok aborts the crawl with err, while a
	// non-nil err alongside a true ok is only logged as a partial failure.
	switch {
	case c.IsSet("seed-file"):
		ok, err := pending.AddFile(c.String("seed-file"))
		if !ok {
			return err
		}
		if err != nil {
			logger.Warnf("Some multiaddrs could not be parsed: %v", err)
		}
	case c.IsSet("seed-table"):
		addrs, err := r.getMultiAddrs(ctx, c.String("dataset"), c.String("seed-table"), c.Duration("seed-table-duration"))
		if err != nil {
			return err
		}
		ok, err := pending.AddStrings(addrs)
		if !ok {
			return err
		}
		if err != nil {
			logger.Warnf("Some multiaddrs could not be parsed: %v", err)
		}
	}

	for _, ma := range bootstrapAddrs {
		if err := pending.Add(ma); err != nil {
			// %v, not %w: the wrapping verb is only understood by fmt.Errorf.
			logger.Warnf("Unable to parse address %s: %v", ma, err)
		}
	}
	logger.Infof("Seeding crawl with %d peer addresses", len(pending))

	// populate host info
	peers := make([]*peer.AddrInfo, 0, len(pending))
	for _, p := range pending {
		pis, err := peer.AddrInfosFromP2pAddrs(p.Addrs...)
		if err != nil {
			logger.Warnf("Failed to parse addresses for %s: %v", p.ID, err)
			continue
		}
		// Take the address of each slice element rather than of the range
		// variable: before Go 1.22 the range variable is reused across
		// iterations, so &pi would make every entry alias the same AddrInfo.
		for i := range pis {
			peers = append(peers, &pis[i])
		}

		nonIDAddrs := make([]multiaddr.Multiaddr, 0, len(p.Addrs))
		// Remove the /p2p/<id> portion of the addresses.
		for _, a := range p.Addrs {
			na, _ := multiaddr.SplitFunc(a, func(c multiaddr.Component) bool {
				return c.Protocol().Code == multiaddr.P_P2P
			})
			nonIDAddrs = append(nonIDAddrs, na)
		}
		h.Peerstore().AddAddrs(p.ID, nonIDAddrs, time.Hour)
	}

	cr, err := crawler.New(h,
		crawler.WithParallelism(c.Int("parallelism")),
		crawler.WithMsgTimeout(c.Duration("timeout")))
	if err != nil {
		// Report the failure to the caller instead of panicking.
		return err
	}

	// Bound the whole crawl by the --crawltime flag.
	crawlCtx, cancel := context.WithTimeout(ctx, c.Duration("crawltime"))
	defer cancel()
	cr.Run(crawlCtx, peers,
		r.onPeerSuccess,
		r.onPeerFailure)

	logger.Info("Crawl complete. Collecting Output...")
	return Output(c.String("output"), r)
}
Loading

0 comments on commit 9292515

Please sign in to comment.