Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include annexed files in zip and tar.gz downloads #43

Open
wants to merge 11 commits into
base: git-annex
Choose a base branch
from
12 changes: 8 additions & 4 deletions .github/workflows/pull-db-tests.yml
Original file line number Diff line number Diff line change
@@ -46,10 +46,11 @@ jobs:
- name: Add hosts to /etc/hosts
run: '[ -e "/.dockerenv" ] || [ -e "/run/.containerenv" ] || echo "127.0.0.1 pgsql ldap minio" | sudo tee -a /etc/hosts'
- run: make deps-backend
- run: sudo apt update && sudo DEBIAN_FRONTEND=noninteractive apt install -y git-annex
- run: make backend
env:
TAGS: bindata
- run: make test-pgsql-migration test-pgsql
- run: make test-pgsql-migration test-pgsql#TestGitAnnex
timeout-minutes: 50
env:
TAGS: bindata gogit
@@ -69,10 +70,11 @@ jobs:
go-version-file: go.mod
check-latest: true
- run: make deps-backend
- run: sudo apt update && sudo DEBIAN_FRONTEND=noninteractive apt install -y git-annex
- run: make backend
env:
TAGS: bindata gogit sqlite sqlite_unlock_notify
- run: make test-sqlite-migration test-sqlite
- run: make test-sqlite-migration test-sqlite#TestGitAnnex
timeout-minutes: 50
env:
TAGS: bindata gogit sqlite sqlite_unlock_notify
@@ -172,11 +174,12 @@ jobs:
- name: Add hosts to /etc/hosts
run: '[ -e "/.dockerenv" ] || [ -e "/run/.containerenv" ] || echo "127.0.0.1 mysql elasticsearch smtpimap" | sudo tee -a /etc/hosts'
- run: make deps-backend
- run: sudo apt update && sudo DEBIAN_FRONTEND=noninteractive apt install -y git-annex
- run: make backend
env:
TAGS: bindata
- name: run tests
run: make test-mysql-migration integration-test-coverage
run: make test-mysql-migration test-mysql#TestGitAnnex
env:
TAGS: bindata
RACE_ENABLED: true
@@ -205,10 +208,11 @@ jobs:
- name: Add hosts to /etc/hosts
run: '[ -e "/.dockerenv" ] || [ -e "/run/.containerenv" ] || echo "127.0.0.1 mssql" | sudo tee -a /etc/hosts'
- run: make deps-backend
- run: sudo apt update && sudo DEBIAN_FRONTEND=noninteractive apt install -y git-annex
- run: make backend
env:
TAGS: bindata
- run: make test-mssql-migration test-mssql
- run: make test-mssql-migration test-mssql#TestGitAnnex
timeout-minutes: 50
env:
TAGS: bindata
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ self := $(location)
@tmpdir=`mktemp --tmpdir -d` ; \
echo Using temporary directory $$tmpdir for test repositories ; \
USE_REPO_TEST_DIR= $(MAKE) -f $(self) --no-print-directory REPO_TEST_DIR=$$tmpdir/ $@ ; \
STATUS=$$? ; rm -r "$$tmpdir" ; exit $$STATUS
STATUS=$$? ; chmod -R +w "$$tmpdir" && rm -r "$$tmpdir" ; exit $$STATUS

else

@@ -114,8 +114,9 @@ LDFLAGS := $(LDFLAGS) -X "main.MakeVersion=$(MAKE_VERSION)" -X "main.Version=$(G
LINUX_ARCHS ?= linux/amd64,linux/386,linux/arm-5,linux/arm-6,linux/arm64

GO_PACKAGES ?= $(filter-out code.gitea.io/gitea/tests/integration/migration-test code.gitea.io/gitea/tests code.gitea.io/gitea/tests/integration code.gitea.io/gitea/tests/e2e,$(shell $(GO) list ./... | grep -v /vendor/))
GO_TEST_PACKAGES ?= $(filter-out $(shell $(GO) list code.gitea.io/gitea/models/migrations/...) code.gitea.io/gitea/tests/integration/migration-test code.gitea.io/gitea/tests code.gitea.io/gitea/tests/integration code.gitea.io/gitea/tests/e2e,$(shell $(GO) list ./... | grep -v /vendor/))

# Only test code modified in the git-annex feature branch; upstream can handle testing the full suite.
# This list was generated by `git diff --stat --name-only main.. -- '*.go' | xargs dirname | sort | uniq`
GO_TEST_PACKAGES ?= code.gitea.io/gitea/modules/annex code.gitea.io/gitea/modules/base code.gitea.io/gitea/modules/git code.gitea.io/gitea/modules/private code.gitea.io/gitea/modules/setting code.gitea.io/gitea/modules/util code.gitea.io/gitea/routers/private code.gitea.io/gitea/routers/web code.gitea.io/gitea/services/auth
FOMANTIC_WORK_DIR := web_src/fomantic

WEBPACK_SOURCES := $(shell find web_src/js web_src/css -type f)
72 changes: 67 additions & 5 deletions cmd/serv.go
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@ import (

const (
lfsAuthenticateVerb = "git-lfs-authenticate"
gitAnnexShellVerb = "git-annex-shell"
)

// CmdServ represents the available serv sub-command.
@@ -89,6 +90,7 @@ var (
"git-upload-archive": perm.AccessModeRead,
"git-receive-pack": perm.AccessModeWrite,
lfsAuthenticateVerb: perm.AccessModeNone,
gitAnnexShellVerb: perm.AccessModeNone, // annex permissions are enforced by GIT_ANNEX_SHELL_READONLY, rather than the Gitea API
}
alphaDashDotPattern = regexp.MustCompile(`[^\w-\.]`)
)
@@ -201,6 +203,7 @@ func runServ(c *cli.Context) error {

verb := words[0]
repoPath := words[1]

if repoPath[0] == '/' {
repoPath = repoPath[1:]
}
@@ -216,9 +219,43 @@ func runServ(c *cli.Context) error {
}
}

if verb == gitAnnexShellVerb {
if !setting.Annex.Enabled {
return fail(ctx, "Unknown git command", "git-annex request over SSH denied, git-annex support is disabled")
}

if len(words) < 3 {
return fail(ctx, "Too few arguments", "Too few arguments in cmd: %s", cmd)
}

// git-annex always puts the repo in words[2], unlike most other
// git subcommands; and it sometimes names repos like /~/, as if
// $HOME should get expanded while also being rooted. e.g.:
// git-annex-shell 'configlist' '/~/user/repo'
// git-annex-shell 'sendkey' '/user/repo 'key'
repoPath = words[2]
repoPath = strings.TrimPrefix(repoPath, "/")
repoPath = strings.TrimPrefix(repoPath, "~/")
}

// LowerCase and trim the repoPath as that's how they are stored.
repoPath = strings.ToLower(strings.TrimSpace(repoPath))

// prevent directory traversal attacks
repoPath = filepath.Clean("/" + repoPath)[1:]

// put the sanitized repoPath back into the argument list for later
if verb == gitAnnexShellVerb {
// git-annex-shell demands an absolute path
absRepoPath, err := filepath.Abs(filepath.Join(setting.RepoRootPath, repoPath))
if err != nil {
return fail(ctx, "Error locating repoPath", "%v", err)
}
words[2] = absRepoPath
} else {
words[1] = repoPath
}

rr := strings.SplitN(repoPath, "/", 2)
if len(rr) != 2 {
return fail(ctx, "Invalid repository path", "Invalid repository path: %v", repoPath)
@@ -305,21 +342,46 @@ func runServ(c *cli.Context) error {
return nil
}

var gitcmd *exec.Cmd
gitBinPath := filepath.Dir(git.GitExecutable) // e.g. /usr/bin
gitBinVerb := filepath.Join(gitBinPath, verb) // e.g. /usr/bin/git-upload-pack
if _, err := os.Stat(gitBinVerb); err != nil {
// if the command "git-upload-pack" doesn't exist, try to split "git-upload-pack" to use the sub-command with git
// ps: Windows only has "git.exe" in the bin path, so Windows always uses this way
// ps: git-annex-shell and other extensions may not necessarily be in gitBinPath,
// but '{gitBinPath}/git annex-shell' should be able to find them on $PATH.
verbFields := strings.SplitN(verb, "-", 2)
if len(verbFields) == 2 {
// use git binary with the sub-command part: "C:\...\bin\git.exe", "upload-pack", ...
gitcmd = exec.CommandContext(ctx, git.GitExecutable, verbFields[1], repoPath)
gitBinVerb = git.GitExecutable
words = append([]string{verbFields[1]}, words...)
}
}
if gitcmd == nil {
// by default, use the verb (it has been checked above by allowedCommands)
gitcmd = exec.CommandContext(ctx, gitBinVerb, repoPath)

// by default, use the verb (it has been checked above by allowedCommands)
gitcmd := exec.CommandContext(ctx, gitBinVerb, words[1:]...)

if verb == gitAnnexShellVerb {
// This doesn't get its own isolated section like LFS does, because LFS
// is handled by internal Gitea routines, but git-annex has to be shelled out
// to like other git subcommands, so we need to build up gitcmd.

// TODO: does this work on Windows?
gitcmd.Env = append(gitcmd.Env,
// "If set, disallows running git-shell to handle unknown commands."
// - git-annex-shell(1)
"GIT_ANNEX_SHELL_LIMITED=True",
// "If set, git-annex-shell will refuse to run commands
// that do not operate on the specified directory."
// - git-annex-shell(1)
fmt.Sprintf("GIT_ANNEX_SHELL_DIRECTORY=%s", words[2]),
)
if results.UserMode < perm.AccessModeWrite {
// "If set, disallows any action that could modify the git-annex repository."
// - git-annex-shell(1)
// We set this when the backend API has told us that we don't have write permission to this repo.
log.Debug("Setting GIT_ANNEX_SHELL_READONLY=True")
gitcmd.Env = append(gitcmd.Env, "GIT_ANNEX_SHELL_READONLY=True")
}
}

process.SetSysProcAttribute(gitcmd)
4 changes: 4 additions & 0 deletions cmd/web.go
Original file line number Diff line number Diff line change
@@ -311,6 +311,10 @@ func listen(m http.Handler, handleRedirector bool) error {
log.Info("LFS server enabled")
}

if setting.Annex.Enabled {
log.Info("git-annex enabled")
}

var err error
switch setting.Protocol {
case setting.HTTP:
9 changes: 9 additions & 0 deletions custom/conf/app.example.ini
Original file line number Diff line number Diff line change
@@ -2517,6 +2517,15 @@ LEVEL = Info
;; override the minio base path if storage type is minio
;MINIO_BASE_PATH = lfs/

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;[annex]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Whether git-annex is enabled; defaults to false
;ENABLED = false

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; settings for packages, will override storage setting
160 changes: 160 additions & 0 deletions modules/annex/annex.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

// Unlike modules/lfs, which operates mainly on git.Blobs, this operates on git.TreeEntrys.
// The motivation for this is that TreeEntrys have an easy pointer to the on-disk repo path,
// while blobs do not (in fact, if building with TAGS=gogit, blobs might exist only in a mock
// filesystem, living only in process RAM). We must have the on-disk path to do anything
// useful with git-annex because all of its interesting data is on-disk under .git/annex/.

package annex

import (
"errors"
"fmt"
"os"
"path"
"strings"

"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
)

const (
// > The maximum size of a pointer file is 32 kb.
// - https://git-annex.branchable.com/internals/pointer_file/
// It's unclear if that's kilobytes or kibibytes; assuming kibibytes:
blobSizeCutoff = 32 * 1024
)

// ErrInvalidPointer occurs if the pointer's value doesn't parse
var ErrInvalidPointer = errors.New("Not a git-annex pointer")

// Gets the content of the blob as raw text, up to n bytes.
// (the pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit)
func getBlobContent(b *git.Blob, n int) (string, error) {
dataRc, err := b.DataAsync()
if err != nil {
return "", err
}
defer dataRc.Close()
buf := make([]byte, n)
n, _ = util.ReadAtMost(dataRc, buf)
buf = buf[:n]
return string(buf), nil
}

func Pointer(blob *git.Blob) (string, error) {
// git-annex doesn't seem fully spec what its pointer are, but
// the fullest description is here:
// https://git-annex.branchable.com/internals/pointer_file/

// a pointer can be:
// the original format, generated by `git annex add`: a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY'
// the newer, git-lfs influenced, format, generated by `git annex smudge`: a text file containing '/annex/objects/$KEY'
//
// in either case we can extract the $KEY the same way, and we need not actually know if it's a symlink or not because
// git.Blob.DataAsync() works like open() + readlink(), handling both cases in one.

if blob.Size() > blobSizeCutoff {
// > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file.
// https://git-annex.branchable.com/internals/pointer_file/

// It's unclear to me whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too.
return "", ErrInvalidPointer
}

pointer, err := getBlobContent(blob, blobSizeCutoff)
if err != nil {
return "", fmt.Errorf("error reading %s: %w", blob.Name(), err)
}

// the spec says a pointer file can contain multiple lines each with a pointer in them
// but that makes no sense to me, so I'm just ignoring all but the first
lines := strings.Split(pointer, "\n")
if len(lines) < 1 {
return "", ErrInvalidPointer
}
pointer = lines[0]

// in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it
if !strings.Contains(pointer, "/annex/") {
return "", ErrInvalidPointer
}

// extract $KEY
pointer = path.Base(strings.TrimSpace(pointer))

// ask git-annex's opinion on $KEY
// XXX: this is probably a bit slow, especially if this operation gets run often
// and examinekey is not that strict:
// - it doesn't enforce that the "BACKEND" tag is one it knows,
// - it doesn't enforce that the fields and their format fit the "BACKEND" tag
// so maybe this is a wasteful step
_, examineStderr, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "examinekey").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path})
if err != nil {
// TODO: make ErrInvalidPointer into a type capable of wrapping err
if strings.TrimSpace(examineStderr) == "git-annex: bad key" {
return "", ErrInvalidPointer
}
return "", err
}

return pointer, nil
}

// return the absolute path of the content pointed to by the annex pointer stored in the git object
// errors if the content is not found in this repo
func ContentLocation(blob *git.Blob) (string, error) {
pointer, err := Pointer(blob)
if err != nil {
return "", err
}

contentLocation, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path})
if err != nil {
return "", fmt.Errorf("in %s: %s does not seem to be a valid annexed file: %w", blob.Repo().Path, pointer, err)
}
contentLocation = strings.TrimSpace(contentLocation)
contentLocation = path.Clean("/" + contentLocation)[1:] // prevent directory traversals
contentLocation = path.Join(blob.Repo().Path, contentLocation)

return contentLocation, nil
}

// returns a stream open to the annex content
func Content(blob *git.Blob) (*os.File, error) {
contentLocation, err := ContentLocation(blob)
if err != nil {
return nil, err
}

return os.Open(contentLocation)
}

// whether the object appears to be a valid annex pointer
// does *not* verify if the content is actually in this repo;
// for that, use ContentLocation()
func IsAnnexed(blob *git.Blob) (bool, error) {
if !setting.Annex.Enabled {
return false, nil
}

// Pointer() is written to only return well-formed pointers
// so the test is just to see if it errors
_, err := Pointer(blob)
if err != nil {
if errors.Is(err, ErrInvalidPointer) {
return false, nil
}
return false, err
}
return true, nil
}

// IsAnnexRepo determines if repo is a git-annex enabled repository
func IsAnnexRepo(repo *git.Repository) bool {
_, _, err := git.NewCommand(repo.Ctx, "annex", "info").RunStdString(&git.RunOpts{Dir: repo.Path})
return err == nil
}
Loading