srpmproc/pkg/modes/git.go

// Copyright (c) 2021 The Srpmproc Authors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package modes

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"net/http"
	"path/filepath"
	"sort"
	"strings"
	"text/template"
	"time"

	"github.com/go-git/go-git/v5/plumbing/transport"
	"github.com/rocky-linux/srpmproc/pkg/misc"

	"github.com/go-git/go-billy/v5/memfs"
	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/config"
	"github.com/go-git/go-git/v5/plumbing"
	"github.com/go-git/go-git/v5/plumbing/object"
	"github.com/go-git/go-git/v5/storage/memory"
	"github.com/rocky-linux/srpmproc/pkg/data"
)

type remoteTarget struct {
	remote string
	when   time.Time
}

// Struct to define the possible template values ( {{.Value}} in CDN URL strings:
type Lookaside struct {
	Name     string
	Branch   string
	Hash     string
	Hashtype string
	Filename string
}

type remoteTargetSlice []remoteTarget

func (p remoteTargetSlice) Len() int {
	return len(p)
}

func (p remoteTargetSlice) Less(i, j int) bool {
	return p[i].when.Before(p[j].when)
}

func (p remoteTargetSlice) Swap(i, j int) {
	p[i], p[j] = p[j], p[i]
}

type GitMode struct{}

func (g *GitMode) RetrieveSource(pd *data.ProcessData) (*data.ModeData, error) {
	repo, err := git.Init(memory.NewStorage(), memfs.New())
	if err != nil {
		return nil, fmt.Errorf("could not init git Repo: %v", err)
	}

	w, err := repo.Worktree()
	if err != nil {
		return nil, fmt.Errorf("could not get Worktree: %v", err)
	}

	refspec := config.RefSpec("+refs/heads/*:refs/remotes/*")
	remote, err := repo.CreateRemote(&config.RemoteConfig{
		Name:  "upstream",
		URLs:  []string{fmt.Sprintf("%s.git", pd.RpmLocation)},
		Fetch: []config.RefSpec{refspec},
	})
	if err != nil {
		return nil, fmt.Errorf("could not create remote: %v", err)
	}

	fetchOpts := &git.FetchOptions{
		Auth:     pd.Authenticator,
		RefSpecs: []config.RefSpec{refspec},
		Tags:     git.AllTags,
		Force:    true,
	}

	err = remote.Fetch(fetchOpts)
	if err != nil {
		if err == transport.ErrInvalidAuthMethod || err == transport.ErrAuthenticationRequired {
			fetchOpts.Auth = nil
			err = remote.Fetch(fetchOpts)
			if err != nil {
				return nil, fmt.Errorf("could not fetch upstream: %v", err)
			}
		} else {
			return nil, fmt.Errorf("could not fetch upstream: %v", err)
		}
	}

	var branches remoteTargetSlice

	latestTags := map[string]*remoteTarget{}

	tagAdd := func(tag *object.Tag) error {
		if strings.HasPrefix(tag.Name, fmt.Sprintf("imports/%s%d", pd.ImportBranchPrefix, pd.Version)) {
			refSpec := fmt.Sprintf("refs/tags/%s", tag.Name)
			if misc.GetTagImportRegex(pd).MatchString(refSpec) {
				match := misc.GetTagImportRegex(pd).FindStringSubmatch(refSpec)

				exists := latestTags[match[2]]
				if exists != nil && exists.when.After(tag.Tagger.When) {
					return nil
				}
				latestTags[match[2]] = &remoteTarget{
					remote: refSpec,
					when:   tag.Tagger.When,
				}
			}
		}
		return nil
	}

	// In case of "tagless mode", we need to get the head ref of the branch instead
	// This is a kind of alternative implementation of the above tagAdd assignment
	refAdd := func(tag *object.Tag) error {
		if misc.TaglessRefOk(tag.Name, pd) {
			pd.Log.Printf("Tagless mode:  Identified tagless commit for import: %s\n", tag.Name)
			refSpec := fmt.Sprintf(tag.Name)

			// We split the string by "/", the branch name we're looking for to pass to latestTags is always last
			// (ex: "refs/heads/c9s" ---> we want latestTags[c9s]
			tmpRef := strings.Split(refSpec, "/")
			tmpBranchName := tmpRef[(len(tmpRef) - 1)]

			latestTags[tmpBranchName] = &remoteTarget{
				remote: refSpec,
				when:   tag.Tagger.When,
			}
		}
		return nil
	}

	tagIter, err := repo.TagObjects()
	if err != nil {
		return nil, fmt.Errorf("could not get tag objects: %v", err)
	}

	// tagless mode means we use "refAdd" (add commit by reference)
	// normal mode means we can rely on "tagAdd" (the tag should be present for us in the source repo)
	if pd.TaglessMode {
		_ = tagIter.ForEach(refAdd)
	} else {
		_ = tagIter.ForEach(tagAdd)
	}

	listOpts := &git.ListOptions{
		Auth: pd.Authenticator,
	}
	list, err := remote.List(listOpts)
	if err != nil {
		if err == transport.ErrInvalidAuthMethod || err == transport.ErrAuthenticationRequired {
			listOpts.Auth = nil
			list, err = remote.List(listOpts)
			if err != nil {
				return nil, fmt.Errorf("could not list upstream: %v", err)
			}
		} else {
			return nil, fmt.Errorf("could not list upstream: %v", err)
		}
	}

	for _, ref := range list {
		if ref.Hash().IsZero() {
			continue
		}

		commit, err := repo.CommitObject(ref.Hash())
		if err != nil {
			continue
		}

		// Call refAdd instead of tagAdd in the case of TaglessMode enabled
		if pd.TaglessMode {
			_ = refAdd(&object.Tag{
				Name:   string(ref.Name()),
				Tagger: commit.Committer,
			})
		} else {
			_ = tagAdd(&object.Tag{
				Name:   strings.TrimPrefix(string(ref.Name()), "refs/tags/"),
				Tagger: commit.Committer,
			})
		}

	}

	for _, branch := range latestTags {
		pd.Log.Printf("tag: %s", strings.TrimPrefix(branch.remote, "refs/tags/"))
		branches = append(branches, *branch)
	}
	sort.Sort(branches)

	var sortedBranches []string
	for _, branch := range branches {
		sortedBranches = append(sortedBranches, branch.remote)
	}

	return &data.ModeData{
		Name:       filepath.Base(pd.RpmLocation),
		Repo:       repo,
		Worktree:   w,
		FileWrites: nil,
		Branches:   sortedBranches,
	}, nil
}

func (g *GitMode) WriteSource(pd *data.ProcessData, md *data.ModeData) error {
	remote, err := md.Repo.Remote("upstream")

	if err != nil && !pd.TaglessMode {
		return fmt.Errorf("could not get upstream remote: %v", err)
	}

	var refspec config.RefSpec
	var branchName string

	// In the case of tagless mode, we already have the transformed repo sitting in the worktree,
	// and don't need to perform any checkout or fetch operations
	if !pd.TaglessMode {
		if strings.HasPrefix(md.TagBranch, "refs/heads") {
			refspec = config.RefSpec(fmt.Sprintf("+%s:%s", md.TagBranch, md.TagBranch))
			branchName = strings.TrimPrefix(md.TagBranch, "refs/heads/")
		} else {
			match := misc.GetTagImportRegex(pd).FindStringSubmatch(md.TagBranch)
			branchName = match[2]
			refspec = config.RefSpec(fmt.Sprintf("+refs/heads/%s:%s", branchName, md.TagBranch))
			fmt.Println("Found branchname that does not start w/ refs/heads :: ", branchName)
		}
		pd.Log.Printf("checking out upstream refspec %s", refspec)

		fetchOpts := &git.FetchOptions{
			Auth:       pd.Authenticator,
			RemoteName: "upstream",
			RefSpecs:   []config.RefSpec{refspec},
			Tags:       git.AllTags,
			Force:      true,
		}
		err = remote.Fetch(fetchOpts)
		if err != nil && err != git.NoErrAlreadyUpToDate {
			if err == transport.ErrInvalidAuthMethod || err == transport.ErrAuthenticationRequired {
				fetchOpts.Auth = nil
				err = remote.Fetch(fetchOpts)
				if err != nil && err != git.NoErrAlreadyUpToDate {
					return fmt.Errorf("could not fetch upstream: %v", err)
				}
			} else {
				return fmt.Errorf("could not fetch upstream: %v", err)
			}
		}

		err = md.Worktree.Checkout(&git.CheckoutOptions{
			Branch: plumbing.ReferenceName(md.TagBranch),
			Force:  true,
		})
		if err != nil {
			return fmt.Errorf("could not checkout source from git: %v", err)
		}

		_, err = md.Worktree.Add(".")
		if err != nil {
			return fmt.Errorf("could not add Worktree: %v", err)
		}
	}

	if pd.TaglessMode {
		branchName = fmt.Sprintf("%s%d%s", pd.ImportBranchPrefix, pd.Version, pd.BranchSuffix)
	}

	metadataPath := ""
	ls, err := md.Worktree.Filesystem.ReadDir(".")
	if err != nil {
		return fmt.Errorf("could not read directory: %v", err)
	}
	for _, f := range ls {
		if strings.HasSuffix(f.Name(), ".metadata") {
			if metadataPath != "" {
				return fmt.Errorf("multiple metadata files found")
			}
			metadataPath = f.Name()
		}
	}
	if metadataPath == "" {
		metadataPath = fmt.Sprintf(".%s.metadata", md.Name)
	}

	metadataFile, err := md.Worktree.Filesystem.Open(metadataPath)
	if err != nil {
		pd.Log.Printf("warn: could not open metadata file, so skipping: %v", err)
		return nil
	}

	fileBytes, err := io.ReadAll(metadataFile)
	if err != nil {
		return fmt.Errorf("could not read metadata file: %v", err)
	}

	client := &http.Client{
		Transport: &http.Transport{
			DisableCompression: false,
		},
	}
	fileContent := strings.Split(string(fileBytes), "\n")
	for _, line := range fileContent {
		if strings.TrimSpace(line) == "" {
			continue
		}

		lineInfo := strings.SplitN(line, " ", 2)
		hash := strings.TrimSpace(lineInfo[0])
		path := strings.TrimSpace(lineInfo[1])

		var body []byte

		if md.BlobCache[hash] != nil {
			body = md.BlobCache[hash]
			pd.Log.Printf("retrieving %s from cache", hash)
		} else {
			fromBlobStorage, err := pd.BlobStorage.Read(hash)
			if err != nil {
				return err
			}
			if fromBlobStorage != nil && !pd.NoStorageDownload {
				body = fromBlobStorage
				pd.Log.Printf("downloading %s from blob storage", hash)
			} else {

				url := ""

				// We need to figure out the hashtype for templating purposes:
				hashType := "sha512"
				switch len(hash) {
				case 128:
					hashType = "sha512"
				case 64:
					hashType = "sha256"
				case 40:
					hashType = "sha1"
				case 32:
					hashType = "md5"
				}

				// need the name of the file without "SOURCES/":
				fileName := strings.Split(path, "/")[1]

				// Feed our template info to ProcessUrl and transform to the real values: ( {{.Name}}, {{.Branch}}, {{.Hash}}, {{.Hashtype}}, {{.Filename}} )
				url, hasTemplate := ProcessUrl(pd.CdnUrl, md.Name, branchName, hash, hashType, fileName)

				var req *http.Request
				var resp *http.Response

				// Download the --cdn-url given, but *only* if it contains template strings ( {{.Name}} , {{.Hash}} , etc. )
				// Otherwise we need to fall back to the traditional cdn-url patterns
				if hasTemplate {
					pd.Log.Printf("downloading %s", url)

					req, err := http.NewRequest("GET", url, nil)
					if err != nil {
						return fmt.Errorf("could not create new http request: %v", err)
					}
					req.Header.Set("Accept-Encoding", "*")

					resp, err = client.Do(req)
					if err != nil {
						return fmt.Errorf("could not download dist-git file: %v", err)
					}
				}

				// Default cdn-url:  If we don't have a templated download string, try the default <SITE>/<PKG>/<BRANCH>/<HASH> pattern:
				if resp == nil || resp.StatusCode != http.StatusOK {
					url = fmt.Sprintf("%s/%s/%s/%s", pd.CdnUrl, md.Name, branchName, hash)
					pd.Log.Printf("Attempting default URL: %s", url)
					req, err = http.NewRequest("GET", url, nil)
					if err != nil {
						return fmt.Errorf("could not create new http request: %v", err)
					}
					req.Header.Set("Accept-Encoding", "*")
					resp, err = client.Do(req)
					if err != nil {
						return fmt.Errorf("could not download dist-git file: %v", err)
					}
				}

				// If the default URL fails, we have one more pattern to try.  The simple <SITE>/<HASH> pattern
				// If this one fails, we are truly lost, and have to bail out w/ an error:
				if resp == nil || resp.StatusCode != http.StatusOK {
					url = fmt.Sprintf("%s/%s", pd.CdnUrl, hash)
					pd.Log.Printf("Attempting 2nd fallback URL: %s", url)
					req, err = http.NewRequest("GET", url, nil)
					if err != nil {
						return fmt.Errorf("could not create new http request: %v", err)
					}
					req.Header.Set("Accept-Encoding", "*")
					resp, err = client.Do(req)
					if err != nil {
						return fmt.Errorf("could not download dist-git file: %v", err)
					}
					if resp.StatusCode != http.StatusOK {
						return fmt.Errorf("could not download dist-git file (status code %d): %v", resp.StatusCode, err)
					}
				}

				body, err = io.ReadAll(resp.Body)
				if err != nil {
					return fmt.Errorf("could not read the whole dist-git file: %v", err)
				}
				err = resp.Body.Close()
				if err != nil {
					return fmt.Errorf("could not close body handle: %v", err)
				}
			}

			md.BlobCache[hash] = body
		}

		f, err := md.Worktree.Filesystem.Create(path)
		if err != nil {
			return fmt.Errorf("could not open file pointer: %v", err)
		}

		hasher := pd.CompareHash(body, hash)
		if hasher == nil {
			return fmt.Errorf("checksum in metadata does not match dist-git file")
		}

		md.SourcesToIgnore = append(md.SourcesToIgnore, &data.IgnoredSource{
			Name:         path,
			HashFunction: hasher,
		})

		_, err = f.Write(body)
		if err != nil {
			return fmt.Errorf("could not copy dist-git file to in-tree: %v", err)
		}
		_ = f.Close()
	}

	return nil
}

func (g *GitMode) PostProcess(md *data.ModeData) error {
	for _, source := range md.SourcesToIgnore {
		_, err := md.Worktree.Filesystem.Stat(source.Name)
		if err == nil {
			err := md.Worktree.Filesystem.Remove(source.Name)
			if err != nil {
				return fmt.Errorf("could not remove dist-git file: %v", err)
			}
		}
	}

	_, err := md.Worktree.Add(".")
	if err != nil {
		return fmt.Errorf("could not add git sources: %v", err)
	}

	return nil
}

func (g *GitMode) ImportName(pd *data.ProcessData, md *data.ModeData) string {
	if misc.GetTagImportRegex(pd).MatchString(md.TagBranch) {
		match := misc.GetTagImportRegex(pd).FindStringSubmatch(md.TagBranch)
		return match[3]
	}

	return strings.Replace(strings.TrimPrefix(md.TagBranch, "refs/heads/"), "%", "_", -1)
}

// Given a cdnUrl string as input, return same string, but with substituted
// template values ( {{.Name}} , {{.Hash}}, {{.Filename}}, etc. )
func ProcessUrl(cdnUrl string, name string, branch string, hash string, hashtype string, filename string) (string, bool) {
	tmpUrl := Lookaside{name, branch, hash, hashtype, filename}

	// Return cdnUrl as-is if we don't have any templates ("{{ .Variable }}") to process:
	if !(strings.Contains(cdnUrl, "{{") && strings.Contains(cdnUrl, "}}")) {
		return cdnUrl, false
	}

	// If we run into trouble with our template parsing, we'll just return the cdnUrl, exactly as we found it
	tmpl, err := template.New("").Parse(cdnUrl)
	if err != nil {
		return cdnUrl, false
	}

	var result bytes.Buffer
	err = tmpl.Execute(&result, tmpUrl)
	if err != nil {
		log.Fatalf("ERROR: Could not process CDN URL template(s) from URL string: %s\n", cdnUrl)
	}

	return result.String(), true

}